{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re, nltk, spacy, gensim, string\n",
    "\n",
    "import pandas as pd\n",
    "\n",
    "# Sklearn\n",
    "from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD\n",
    "from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer\n",
    "from sklearn.metrics.pairwise import linear_kernel\n",
    "from sklearn.model_selection import GridSearchCV\n",
    "from sklearn.feature_extraction.text import TfidfTransformer;\n",
    "from sklearn.decomposition import NMF;\n",
    "from sklearn.preprocessing import normalize\n",
    "from pprint import pprint\n",
    "\n",
    "# Plotting tools\n",
    "import pyLDAvis\n",
    "import pyLDAvis.sklearn\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the pre-cleaned questions (raw text plus its lemmatized form).\n",
    "df = pd.read_csv(filepath_or_buffer='df_clean.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>question_text</th>\n",
       "      <th>question_lemmatize</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>how did quebec nationalists see their province...</td>\n",
       "      <td>how do quebec nationalist see -PRON- province ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>do you have an adopted dog how would you encou...</td>\n",
       "      <td>do -PRON- have an adopt dog how would -PRON- e...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>why does velocity affect time does velocity af...</td>\n",
       "      <td>why do velocity affect time do velocity affect...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>how did otto von guericke used the magdeburg h...</td>\n",
       "      <td>how do otto von guericke use the magdeburg hem...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>can i convert montra helicon d to a mountain b...</td>\n",
       "      <td>can i convert montra helicon d to a mountain b...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                       question_text  \\\n",
       "0  how did quebec nationalists see their province...   \n",
       "1  do you have an adopted dog how would you encou...   \n",
       "2  why does velocity affect time does velocity af...   \n",
       "3  how did otto von guericke used the magdeburg h...   \n",
       "4  can i convert montra helicon d to a mountain b...   \n",
       "\n",
       "                                  question_lemmatize  \n",
       "0  how do quebec nationalist see -PRON- province ...  \n",
       "1  do -PRON- have an adopt dog how would -PRON- e...  \n",
       "2  why do velocity affect time do velocity affect...  \n",
       "3  how do otto von guericke use the magdeburg hem...  \n",
       "4  can i convert montra helicon d to a mountain b...  "
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Peek at the first five rows of the loaded frame.\n",
    "df.head(n=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Drop the '-PRON-' placeholder (presumably the lemmatizer's pronoun marker)\n",
    "# so it cannot surface as a token later on.\n",
    "df['question_lemmatize_clean'] = df.question_lemmatize.str.replace(pat='-PRON-', repl='')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>question_text</th>\n",
       "      <th>question_lemmatize</th>\n",
       "      <th>question_lemmatize_clean</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>how did quebec nationalists see their province...</td>\n",
       "      <td>how do quebec nationalist see -PRON- province ...</td>\n",
       "      <td>how do quebec nationalist see  province as a n...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>do you have an adopted dog how would you encou...</td>\n",
       "      <td>do -PRON- have an adopt dog how would -PRON- e...</td>\n",
       "      <td>do  have an adopt dog how would  encourage peo...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>why does velocity affect time does velocity af...</td>\n",
       "      <td>why do velocity affect time do velocity affect...</td>\n",
       "      <td>why do velocity affect time do velocity affect...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>how did otto von guericke used the magdeburg h...</td>\n",
       "      <td>how do otto von guericke use the magdeburg hem...</td>\n",
       "      <td>how do otto von guericke use the magdeburg hem...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>can i convert montra helicon d to a mountain b...</td>\n",
       "      <td>can i convert montra helicon d to a mountain b...</td>\n",
       "      <td>can i convert montra helicon d to a mountain b...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                       question_text  \\\n",
       "0  how did quebec nationalists see their province...   \n",
       "1  do you have an adopted dog how would you encou...   \n",
       "2  why does velocity affect time does velocity af...   \n",
       "3  how did otto von guericke used the magdeburg h...   \n",
       "4  can i convert montra helicon d to a mountain b...   \n",
       "\n",
       "                                  question_lemmatize  \\\n",
       "0  how do quebec nationalist see -PRON- province ...   \n",
       "1  do -PRON- have an adopt dog how would -PRON- e...   \n",
       "2  why do velocity affect time do velocity affect...   \n",
       "3  how do otto von guericke use the magdeburg hem...   \n",
       "4  can i convert montra helicon d to a mountain b...   \n",
       "\n",
       "                            question_lemmatize_clean  \n",
       "0  how do quebec nationalist see  province as a n...  \n",
       "1  do  have an adopt dog how would  encourage peo...  \n",
       "2  why do velocity affect time do velocity affect...  \n",
       "3  how do otto von guericke use the magdeburg hem...  \n",
       "4  can i convert montra helicon d to a mountain b...  "
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Check the new question_lemmatize_clean column next to the original ones.\n",
    "df.head(n=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package averaged_perceptron_tagger to\n",
      "[nltk_data]     /home/jupyter-susan/nltk_data...\n",
      "[nltk_data]   Package averaged_perceptron_tagger is already up-to-\n",
      "[nltk_data]       date!\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import nltk\n",
    "\n",
    "# word_tokenize needs the 'punkt' models and pos_tag needs the perceptron\n",
    "# tagger; fetch both so the notebook also runs on a fresh environment.\n",
    "for resource in ('punkt', 'averaged_perceptron_tagger'):\n",
    "    nltk.download(resource)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "from nltk import word_tokenize, pos_tag\n",
    "\n",
    "def nouns(text):\n",
    "    '''Given a string of text, tokenize the text and pull out only the nouns.'''\n",
    "    is_noun = lambda pos: pos[:2] == 'NN'\n",
    "    tokenized = word_tokenize(text)\n",
    "    all_nouns = [word for (word, pos) in pos_tag(tokenized) if is_noun(pos)] \n",
    "    return ' '.join(all_nouns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>question_lemmatize_clean</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>nationalist province nation</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>do adopt dog people</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>velocity time velocity space geometry</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>guericke magdeburg hemisphere</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>helicon d mountain bike tyre</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>983796</td>\n",
       "      <td>facebook page page</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>983797</td>\n",
       "      <td>something</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>983798</td>\n",
       "      <td>cycle women cycle</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>983799</td>\n",
       "      <td>difference currency note rs rs currency note r...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>983800</td>\n",
       "      <td>form dml</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>983801 rows × 1 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                 question_lemmatize_clean\n",
       "0                             nationalist province nation\n",
       "1                                     do adopt dog people\n",
       "2                   velocity time velocity space geometry\n",
       "3                           guericke magdeburg hemisphere\n",
       "4                            helicon d mountain bike tyre\n",
       "...                                                   ...\n",
       "983796                                 facebook page page\n",
       "983797                                          something\n",
       "983798                                  cycle women cycle\n",
       "983799  difference currency note rs rs currency note r...\n",
       "983800                                           form dml\n",
       "\n",
       "[983801 rows x 1 columns]"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Run the noun extractor over every cleaned question; to_frame() keeps the\n",
    "# column name, so the result is a one-column DataFrame.\n",
    "df_nouns = df['question_lemmatize_clean'].apply(nouns).to_frame()\n",
    "df_nouns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the noun-only text so the tagging pass need not be repeated.\n",
    "df_nouns.to_csv(path_or_buf='df_nouns.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# keep_default_na=False: questions with no nouns were saved as empty strings;\n",
    "# the default NA parsing would load them (and literal words such as 'null' or\n",
    "# 'nan') as NaN, which later gets stringified into a bogus 'nan' token.\n",
    "df_nouns = pd.read_csv('df_nouns.csv', keep_default_na=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>question_lemmatize_clean</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>nationalist province nation</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>do adopt dog people</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>velocity time velocity space geometry</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>guericke magdeburg hemisphere</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>helicon d mountain bike tyre</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                question_lemmatize_clean\n",
       "0            nationalist province nation\n",
       "1                    do adopt dog people\n",
       "2  velocity time velocity space geometry\n",
       "3          guericke magdeburg hemisphere\n",
       "4           helicon d mountain bike tyre"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Sanity-check the reloaded noun-only frame.\n",
    "df_nouns.head(n=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "n_features = 4000\n",
    "n_components = 20\n",
    "n_top_words = 20"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# max_df=0.95 drops terms appearing in more than 95% of the documents,\n",
    "# min_df=2 drops terms appearing in fewer than 2 documents.\n",
    "tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,\n",
    "                                   max_features=n_features,\n",
    "                                   stop_words='english')\n",
    "\n",
    "# Rows without nouns may be loaded as NaN; blank them out first, otherwise\n",
    "# astype(str) turns each NaN into the literal token 'nan', which then\n",
    "# shows up as a topic word.\n",
    "noun_docs = df_nouns['question_lemmatize_clean'].fillna('').astype(str)\n",
    "tfidf = tfidf_vectorizer.fit_transform(noun_docs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# alpha=.1 turns regularization on (alpha=0 would mean none); l1_ratio=.5\n",
    "# makes the penalty an even mix of L1 and L2.\n",
    "# The model is only configured here: the next cell's fit_transform() does the\n",
    "# fitting, so calling .fit() here as well would factorize the matrix twice.\n",
    "nmf = NMF(n_components=n_components, random_state=1, alpha=.1, l1_ratio=.5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fit the factorization on the TF-IDF matrix and keep the transformed data.\n",
    "nmf_output = nmf.fit_transform(X=tfidf)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Word 0</th>\n",
       "      <th>Word 1</th>\n",
       "      <th>Word 2</th>\n",
       "      <th>Word 3</th>\n",
       "      <th>Word 4</th>\n",
       "      <th>Word 5</th>\n",
       "      <th>Word 6</th>\n",
       "      <th>Word 7</th>\n",
       "      <th>Word 8</th>\n",
       "      <th>Word 9</th>\n",
       "      <th>Word 10</th>\n",
       "      <th>Word 11</th>\n",
       "      <th>Word 12</th>\n",
       "      <th>Word 13</th>\n",
       "      <th>Word 14</th>\n",
       "      <th>Word 15</th>\n",
       "      <th>Word 16</th>\n",
       "      <th>Word 17</th>\n",
       "      <th>Word 18</th>\n",
       "      <th>Word 19</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>Topic 0</td>\n",
       "      <td>nan</td>\n",
       "      <td>ab</td>\n",
       "      <td>phd</td>\n",
       "      <td>phenomenon</td>\n",
       "      <td>philippine</td>\n",
       "      <td>philosopher</td>\n",
       "      <td>philosophy</td>\n",
       "      <td>phone</td>\n",
       "      <td>photo</td>\n",
       "      <td>photograph</td>\n",
       "      <td>photographer</td>\n",
       "      <td>photography</td>\n",
       "      <td>photon</td>\n",
       "      <td>photoshop</td>\n",
       "      <td>photosynthesis</td>\n",
       "      <td>php</td>\n",
       "      <td>physic</td>\n",
       "      <td>physician</td>\n",
       "      <td>physicist</td>\n",
       "      <td>physics</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Topic 1</td>\n",
       "      <td>people</td>\n",
       "      <td>reason</td>\n",
       "      <td>money</td>\n",
       "      <td>group</td>\n",
       "      <td>friend</td>\n",
       "      <td>lot</td>\n",
       "      <td>relationship</td>\n",
       "      <td>word</td>\n",
       "      <td>problem</td>\n",
       "      <td>age</td>\n",
       "      <td>government</td>\n",
       "      <td>kind</td>\n",
       "      <td>religion</td>\n",
       "      <td>talk</td>\n",
       "      <td>think</td>\n",
       "      <td>state</td>\n",
       "      <td>love</td>\n",
       "      <td>family</td>\n",
       "      <td>look</td>\n",
       "      <td>medium</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Topic 2</td>\n",
       "      <td>life</td>\n",
       "      <td>change</td>\n",
       "      <td>moment</td>\n",
       "      <td>meaning</td>\n",
       "      <td>experience</td>\n",
       "      <td>purpose</td>\n",
       "      <td>example</td>\n",
       "      <td>point</td>\n",
       "      <td>love</td>\n",
       "      <td>rest</td>\n",
       "      <td>lesson</td>\n",
       "      <td>death</td>\n",
       "      <td>event</td>\n",
       "      <td>story</td>\n",
       "      <td>goal</td>\n",
       "      <td>friend</td>\n",
       "      <td>dream</td>\n",
       "      <td>earth</td>\n",
       "      <td>movie</td>\n",
       "      <td>parent</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Topic 3</td>\n",
       "      <td>way</td>\n",
       "      <td>money</td>\n",
       "      <td>online</td>\n",
       "      <td>language</td>\n",
       "      <td>business</td>\n",
       "      <td>home</td>\n",
       "      <td>girl</td>\n",
       "      <td>friend</td>\n",
       "      <td>weight</td>\n",
       "      <td>number</td>\n",
       "      <td>website</td>\n",
       "      <td>app</td>\n",
       "      <td>child</td>\n",
       "      <td>month</td>\n",
       "      <td>market</td>\n",
       "      <td>account</td>\n",
       "      <td>phone</td>\n",
       "      <td>guy</td>\n",
       "      <td>company</td>\n",
       "      <td>hair</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Topic 4</td>\n",
       "      <td>thing</td>\n",
       "      <td>weekend</td>\n",
       "      <td>parent</td>\n",
       "      <td>today</td>\n",
       "      <td>mind</td>\n",
       "      <td>child</td>\n",
       "      <td>friend</td>\n",
       "      <td>need</td>\n",
       "      <td>love</td>\n",
       "      <td>guy</td>\n",
       "      <td>relationship</td>\n",
       "      <td>internet</td>\n",
       "      <td>girl</td>\n",
       "      <td>happen</td>\n",
       "      <td>learn</td>\n",
       "      <td>teacher</td>\n",
       "      <td>lot</td>\n",
       "      <td>kind</td>\n",
       "      <td>family</td>\n",
       "      <td>money</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Topic 5</td>\n",
       "      <td>time</td>\n",
       "      <td>period</td>\n",
       "      <td>travel</td>\n",
       "      <td>sex</td>\n",
       "      <td>movie</td>\n",
       "      <td>friend</td>\n",
       "      <td>waste</td>\n",
       "      <td>money</td>\n",
       "      <td>month</td>\n",
       "      <td>space</td>\n",
       "      <td>girl</td>\n",
       "      <td>week</td>\n",
       "      <td>change</td>\n",
       "      <td>study</td>\n",
       "      <td>lot</td>\n",
       "      <td>relationship</td>\n",
       "      <td>place</td>\n",
       "      <td>hour</td>\n",
       "      <td>guy</td>\n",
       "      <td>number</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Topic 6</td>\n",
       "      <td>use</td>\n",
       "      <td>language</td>\n",
       "      <td>word</td>\n",
       "      <td>app</td>\n",
       "      <td>type</td>\n",
       "      <td>technique</td>\n",
       "      <td>phone</td>\n",
       "      <td>device</td>\n",
       "      <td>software</td>\n",
       "      <td>number</td>\n",
       "      <td>car</td>\n",
       "      <td>water</td>\n",
       "      <td>method</td>\n",
       "      <td>tool</td>\n",
       "      <td>website</td>\n",
       "      <td>oil</td>\n",
       "      <td>advantage</td>\n",
       "      <td>datum</td>\n",
       "      <td>technology</td>\n",
       "      <td>company</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Topic 7</td>\n",
       "      <td>year</td>\n",
       "      <td>girl</td>\n",
       "      <td>experience</td>\n",
       "      <td>age</td>\n",
       "      <td>date</td>\n",
       "      <td>guy</td>\n",
       "      <td>relationship</td>\n",
       "      <td>boy</td>\n",
       "      <td>month</td>\n",
       "      <td>change</td>\n",
       "      <td>engineering</td>\n",
       "      <td>money</td>\n",
       "      <td>exam</td>\n",
       "      <td>drop</td>\n",
       "      <td>start</td>\n",
       "      <td>company</td>\n",
       "      <td>paper</td>\n",
       "      <td>gap</td>\n",
       "      <td>course</td>\n",
       "      <td>parent</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Topic 8</td>\n",
       "      <td>india</td>\n",
       "      <td>company</td>\n",
       "      <td>state</td>\n",
       "      <td>money</td>\n",
       "      <td>business</td>\n",
       "      <td>place</td>\n",
       "      <td>scope</td>\n",
       "      <td>cost</td>\n",
       "      <td>government</td>\n",
       "      <td>course</td>\n",
       "      <td>china</td>\n",
       "      <td>pakistan</td>\n",
       "      <td>service</td>\n",
       "      <td>car</td>\n",
       "      <td>online</td>\n",
       "      <td>minister</td>\n",
       "      <td>product</td>\n",
       "      <td>city</td>\n",
       "      <td>college</td>\n",
       "      <td>bank</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Topic 9</td>\n",
       "      <td>person</td>\n",
       "      <td>friend</td>\n",
       "      <td>love</td>\n",
       "      <td>relationship</td>\n",
       "      <td>kind</td>\n",
       "      <td>date</td>\n",
       "      <td>dream</td>\n",
       "      <td>message</td>\n",
       "      <td>history</td>\n",
       "      <td>talk</td>\n",
       "      <td>number</td>\n",
       "      <td>personality</td>\n",
       "      <td>reason</td>\n",
       "      <td>word</td>\n",
       "      <td>phone</td>\n",
       "      <td>money</td>\n",
       "      <td>change</td>\n",
       "      <td>age</td>\n",
       "      <td>sex</td>\n",
       "      <td>type</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Topic 10</td>\n",
       "      <td>difference</td>\n",
       "      <td>similarity</td>\n",
       "      <td>state</td>\n",
       "      <td>word</td>\n",
       "      <td>term</td>\n",
       "      <td>language</td>\n",
       "      <td>vs</td>\n",
       "      <td>engineering</td>\n",
       "      <td>number</td>\n",
       "      <td>computer</td>\n",
       "      <td>cell</td>\n",
       "      <td>science</td>\n",
       "      <td>culture</td>\n",
       "      <td>example</td>\n",
       "      <td>force</td>\n",
       "      <td>love</td>\n",
       "      <td>power</td>\n",
       "      <td>point</td>\n",
       "      <td>type</td>\n",
       "      <td>function</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Topic 11</td>\n",
       "      <td>country</td>\n",
       "      <td>state</td>\n",
       "      <td>china</td>\n",
       "      <td>war</td>\n",
       "      <td>government</td>\n",
       "      <td>europe</td>\n",
       "      <td>citizen</td>\n",
       "      <td>language</td>\n",
       "      <td>america</td>\n",
       "      <td>compare</td>\n",
       "      <td>law</td>\n",
       "      <td>population</td>\n",
       "      <td>culture</td>\n",
       "      <td>economy</td>\n",
       "      <td>canada</td>\n",
       "      <td>russia</td>\n",
       "      <td>rate</td>\n",
       "      <td>africa</td>\n",
       "      <td>visa</td>\n",
       "      <td>travel</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Topic 12</td>\n",
       "      <td>work</td>\n",
       "      <td>tip</td>\n",
       "      <td>company</td>\n",
       "      <td>experience</td>\n",
       "      <td>home</td>\n",
       "      <td>engineer</td>\n",
       "      <td>software</td>\n",
       "      <td>hour</td>\n",
       "      <td>business</td>\n",
       "      <td>visa</td>\n",
       "      <td>service</td>\n",
       "      <td>employee</td>\n",
       "      <td>project</td>\n",
       "      <td>industry</td>\n",
       "      <td>field</td>\n",
       "      <td>money</td>\n",
       "      <td>bank</td>\n",
       "      <td>canada</td>\n",
       "      <td>developer</td>\n",
       "      <td>permit</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Topic 13</td>\n",
       "      <td>book</td>\n",
       "      <td>movie</td>\n",
       "      <td>preparation</td>\n",
       "      <td>class</td>\n",
       "      <td>device</td>\n",
       "      <td>exam</td>\n",
       "      <td>history</td>\n",
       "      <td>character</td>\n",
       "      <td>read</td>\n",
       "      <td>inspire</td>\n",
       "      <td>jee</td>\n",
       "      <td>theme</td>\n",
       "      <td>beginner</td>\n",
       "      <td>physics</td>\n",
       "      <td>chemistry</td>\n",
       "      <td>story</td>\n",
       "      <td>science</td>\n",
       "      <td>series</td>\n",
       "      <td>setting</td>\n",
       "      <td>math</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Topic 14</td>\n",
       "      <td>job</td>\n",
       "      <td>company</td>\n",
       "      <td>engineering</td>\n",
       "      <td>engineer</td>\n",
       "      <td>experience</td>\n",
       "      <td>opportunity</td>\n",
       "      <td>government</td>\n",
       "      <td>pay</td>\n",
       "      <td>degree</td>\n",
       "      <td>software</td>\n",
       "      <td>canada</td>\n",
       "      <td>interview</td>\n",
       "      <td>course</td>\n",
       "      <td>money</td>\n",
       "      <td>kind</td>\n",
       "      <td>graduate</td>\n",
       "      <td>computer</td>\n",
       "      <td>field</td>\n",
       "      <td>career</td>\n",
       "      <td>science</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Topic 15</td>\n",
       "      <td>woman</td>\n",
       "      <td>man</td>\n",
       "      <td>girl</td>\n",
       "      <td>sex</td>\n",
       "      <td>guy</td>\n",
       "      <td>friend</td>\n",
       "      <td>date</td>\n",
       "      <td>relationship</td>\n",
       "      <td>love</td>\n",
       "      <td>age</td>\n",
       "      <td>wife</td>\n",
       "      <td>look</td>\n",
       "      <td>child</td>\n",
       "      <td>marriage</td>\n",
       "      <td>hair</td>\n",
       "      <td>husband</td>\n",
       "      <td>boy</td>\n",
       "      <td>marry</td>\n",
       "      <td>movie</td>\n",
       "      <td>body</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Topic 16</td>\n",
       "      <td>world</td>\n",
       "      <td>war</td>\n",
       "      <td>place</td>\n",
       "      <td>cup</td>\n",
       "      <td>today</td>\n",
       "      <td>language</td>\n",
       "      <td>history</td>\n",
       "      <td>look</td>\n",
       "      <td>city</td>\n",
       "      <td>change</td>\n",
       "      <td>end</td>\n",
       "      <td>rest</td>\n",
       "      <td>power</td>\n",
       "      <td>company</td>\n",
       "      <td>population</td>\n",
       "      <td>state</td>\n",
       "      <td>ii</td>\n",
       "      <td>money</td>\n",
       "      <td>leader</td>\n",
       "      <td>game</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Topic 17</td>\n",
       "      <td>day</td>\n",
       "      <td>period</td>\n",
       "      <td>hour</td>\n",
       "      <td>week</td>\n",
       "      <td>month</td>\n",
       "      <td>sex</td>\n",
       "      <td>place</td>\n",
       "      <td>night</td>\n",
       "      <td>water</td>\n",
       "      <td>exam</td>\n",
       "      <td>trip</td>\n",
       "      <td>today</td>\n",
       "      <td>weight</td>\n",
       "      <td>test</td>\n",
       "      <td>eat</td>\n",
       "      <td>study</td>\n",
       "      <td>cause</td>\n",
       "      <td>food</td>\n",
       "      <td>body</td>\n",
       "      <td>plan</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Topic 18</td>\n",
       "      <td>school</td>\n",
       "      <td>student</td>\n",
       "      <td>college</td>\n",
       "      <td>university</td>\n",
       "      <td>engineering</td>\n",
       "      <td>class</td>\n",
       "      <td>study</td>\n",
       "      <td>science</td>\n",
       "      <td>course</td>\n",
       "      <td>business</td>\n",
       "      <td>computer</td>\n",
       "      <td>friend</td>\n",
       "      <td>state</td>\n",
       "      <td>exam</td>\n",
       "      <td>money</td>\n",
       "      <td>admission</td>\n",
       "      <td>language</td>\n",
       "      <td>mark</td>\n",
       "      <td>rank</td>\n",
       "      <td>program</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Topic 19</td>\n",
       "      <td>question</td>\n",
       "      <td>quora</td>\n",
       "      <td>answer</td>\n",
       "      <td>interview</td>\n",
       "      <td>ask</td>\n",
       "      <td>topic</td>\n",
       "      <td>paper</td>\n",
       "      <td>type</td>\n",
       "      <td>account</td>\n",
       "      <td>number</td>\n",
       "      <td>post</td>\n",
       "      <td>comment</td>\n",
       "      <td>view</td>\n",
       "      <td>writer</td>\n",
       "      <td>site</td>\n",
       "      <td>user</td>\n",
       "      <td>exam</td>\n",
       "      <td>kind</td>\n",
       "      <td>feed</td>\n",
       "      <td>test</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "              Word 0      Word 1       Word 2        Word 3       Word 4  \\\n",
       "Topic 0          nan          ab          phd    phenomenon   philippine   \n",
       "Topic 1       people      reason        money         group       friend   \n",
       "Topic 2         life      change       moment       meaning   experience   \n",
       "Topic 3          way       money       online      language     business   \n",
       "Topic 4        thing     weekend       parent         today         mind   \n",
       "Topic 5         time      period       travel           sex        movie   \n",
       "Topic 6          use    language         word           app         type   \n",
       "Topic 7         year        girl   experience           age         date   \n",
       "Topic 8        india     company        state         money     business   \n",
       "Topic 9       person      friend         love  relationship         kind   \n",
       "Topic 10  difference  similarity        state          word         term   \n",
       "Topic 11     country       state        china           war   government   \n",
       "Topic 12        work         tip      company    experience         home   \n",
       "Topic 13        book       movie  preparation         class       device   \n",
       "Topic 14         job     company  engineering      engineer   experience   \n",
       "Topic 15       woman         man         girl           sex          guy   \n",
       "Topic 16       world         war        place           cup        today   \n",
       "Topic 17         day      period         hour          week        month   \n",
       "Topic 18      school     student      college    university  engineering   \n",
       "Topic 19    question       quora       answer     interview          ask   \n",
       "\n",
       "               Word 5        Word 6        Word 7      Word 8      Word 9  \\\n",
       "Topic 0   philosopher    philosophy         phone       photo  photograph   \n",
       "Topic 1           lot  relationship          word     problem         age   \n",
       "Topic 2       purpose       example         point        love        rest   \n",
       "Topic 3          home          girl        friend      weight      number   \n",
       "Topic 4         child        friend          need        love         guy   \n",
       "Topic 5        friend         waste         money       month       space   \n",
       "Topic 6     technique         phone        device    software      number   \n",
       "Topic 7           guy  relationship           boy       month      change   \n",
       "Topic 8         place         scope          cost  government      course   \n",
       "Topic 9          date         dream       message     history        talk   \n",
       "Topic 10     language            vs   engineering      number    computer   \n",
       "Topic 11       europe       citizen      language     america     compare   \n",
       "Topic 12     engineer      software          hour    business        visa   \n",
       "Topic 13         exam       history     character        read     inspire   \n",
       "Topic 14  opportunity    government           pay      degree    software   \n",
       "Topic 15       friend          date  relationship        love         age   \n",
       "Topic 16     language       history          look        city      change   \n",
       "Topic 17          sex         place         night       water        exam   \n",
       "Topic 18        class         study       science      course    business   \n",
       "Topic 19        topic         paper          type     account      number   \n",
       "\n",
       "               Word 10      Word 11   Word 12    Word 13         Word 14  \\\n",
       "Topic 0   photographer  photography    photon  photoshop  photosynthesis   \n",
       "Topic 1     government         kind  religion       talk           think   \n",
       "Topic 2         lesson        death     event      story            goal   \n",
       "Topic 3        website          app     child      month          market   \n",
       "Topic 4   relationship     internet      girl     happen           learn   \n",
       "Topic 5           girl         week    change      study             lot   \n",
       "Topic 6            car        water    method       tool         website   \n",
       "Topic 7    engineering        money      exam       drop           start   \n",
       "Topic 8          china     pakistan   service        car          online   \n",
       "Topic 9         number  personality    reason       word           phone   \n",
       "Topic 10          cell      science   culture    example           force   \n",
       "Topic 11           law   population   culture    economy          canada   \n",
       "Topic 12       service     employee   project   industry           field   \n",
       "Topic 13           jee        theme  beginner    physics       chemistry   \n",
       "Topic 14        canada    interview    course      money            kind   \n",
       "Topic 15          wife         look     child   marriage            hair   \n",
       "Topic 16           end         rest     power    company      population   \n",
       "Topic 17          trip        today    weight       test             eat   \n",
       "Topic 18      computer       friend     state       exam           money   \n",
       "Topic 19          post      comment      view     writer            site   \n",
       "\n",
       "               Word 15    Word 16    Word 17     Word 18   Word 19  \n",
       "Topic 0            php     physic  physician   physicist   physics  \n",
       "Topic 1          state       love     family        look    medium  \n",
       "Topic 2         friend      dream      earth       movie    parent  \n",
       "Topic 3        account      phone        guy     company      hair  \n",
       "Topic 4        teacher        lot       kind      family     money  \n",
       "Topic 5   relationship      place       hour         guy    number  \n",
       "Topic 6            oil  advantage      datum  technology   company  \n",
       "Topic 7        company      paper        gap      course    parent  \n",
       "Topic 8       minister    product       city     college      bank  \n",
       "Topic 9          money     change        age         sex      type  \n",
       "Topic 10          love      power      point        type  function  \n",
       "Topic 11        russia       rate     africa        visa    travel  \n",
       "Topic 12         money       bank     canada   developer    permit  \n",
       "Topic 13         story    science     series     setting      math  \n",
       "Topic 14      graduate   computer      field      career   science  \n",
       "Topic 15       husband        boy      marry       movie      body  \n",
       "Topic 16         state         ii      money      leader      game  \n",
       "Topic 17         study      cause       food        body      plan  \n",
       "Topic 18     admission   language       mark        rank   program  \n",
       "Topic 19          user       exam       kind        feed      test  "
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def show_topics(vectorizer=tfidf_vectorizer, lda_model=nmf, n_words=20):\n",
    "    keywords = np.array(vectorizer.get_feature_names())\n",
    "    topic_keywords = []\n",
    "    for topic_weights in lda_model.components_:\n",
    "        top_keyword_locs = (-topic_weights).argsort()[:n_words]\n",
    "        topic_keywords.append(keywords.take(top_keyword_locs))\n",
    "    return topic_keywords\n",
    "\n",
    "topic_keywords = show_topics(vectorizer=tfidf_vectorizer, lda_model=nmf, n_words=20)        \n",
    "\n",
    "# Topic - Keywords Dataframe\n",
    "df_topic_keywords = pd.DataFrame(topic_keywords)\n",
    "df_topic_keywords.columns = ['Word '+str(i) for i in range(df_topic_keywords.shape[1])]\n",
    "df_topic_keywords.index = ['Topic '+str(i) for i in range(df_topic_keywords.shape[0])]\n",
    "df_topic_keywords"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "Topics_theme = ['Word start from ph', 'People/Friend/Relationship', 'Life/Experience/Love/Purpose', 'Money/Internet/Business', \n",
    "                'Weekend/Parent/Child', 'Leisure time', 'Language/technique/software', 'Relationship/Girl/Boy', \n",
    "                'Business relate to India, China or Pakistan', 'Friend/Love/Relationship', 'Difference and similarity/Language/Engineering', \n",
    "                'Culture, travel and visa requirements in several countries', 'Tips on working as software engineering', 'Book/Movie/Class/History/Physics/Chemistry/Science', \n",
    "                'Software engineer job opportunitis in Canada', 'Love/Life/Relationship', 'World/War/Language/History', 'Day/Hour/Week/Month/Sex/Place', 'School/Student/College/University', \n",
    "                'Question/Answer/Quora/Interview']\n",
    "df_topic_keywords['topic_theme'] = Topics_theme"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Word 0</th>\n",
       "      <th>Word 1</th>\n",
       "      <th>Word 2</th>\n",
       "      <th>Word 3</th>\n",
       "      <th>Word 4</th>\n",
       "      <th>Word 5</th>\n",
       "      <th>Word 6</th>\n",
       "      <th>Word 7</th>\n",
       "      <th>Word 8</th>\n",
       "      <th>Word 9</th>\n",
       "      <th>...</th>\n",
       "      <th>Word 11</th>\n",
       "      <th>Word 12</th>\n",
       "      <th>Word 13</th>\n",
       "      <th>Word 14</th>\n",
       "      <th>Word 15</th>\n",
       "      <th>Word 16</th>\n",
       "      <th>Word 17</th>\n",
       "      <th>Word 18</th>\n",
       "      <th>Word 19</th>\n",
       "      <th>topic_theme</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>Topic 0</td>\n",
       "      <td>nan</td>\n",
       "      <td>ab</td>\n",
       "      <td>phd</td>\n",
       "      <td>phenomenon</td>\n",
       "      <td>philippine</td>\n",
       "      <td>philosopher</td>\n",
       "      <td>philosophy</td>\n",
       "      <td>phone</td>\n",
       "      <td>photo</td>\n",
       "      <td>photograph</td>\n",
       "      <td>...</td>\n",
       "      <td>photography</td>\n",
       "      <td>photon</td>\n",
       "      <td>photoshop</td>\n",
       "      <td>photosynthesis</td>\n",
       "      <td>php</td>\n",
       "      <td>physic</td>\n",
       "      <td>physician</td>\n",
       "      <td>physicist</td>\n",
       "      <td>physics</td>\n",
       "      <td>Word start from ph</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Topic 1</td>\n",
       "      <td>people</td>\n",
       "      <td>reason</td>\n",
       "      <td>money</td>\n",
       "      <td>group</td>\n",
       "      <td>friend</td>\n",
       "      <td>lot</td>\n",
       "      <td>relationship</td>\n",
       "      <td>word</td>\n",
       "      <td>problem</td>\n",
       "      <td>age</td>\n",
       "      <td>...</td>\n",
       "      <td>kind</td>\n",
       "      <td>religion</td>\n",
       "      <td>talk</td>\n",
       "      <td>think</td>\n",
       "      <td>state</td>\n",
       "      <td>love</td>\n",
       "      <td>family</td>\n",
       "      <td>look</td>\n",
       "      <td>medium</td>\n",
       "      <td>People/Friend/Relationship</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Topic 2</td>\n",
       "      <td>life</td>\n",
       "      <td>change</td>\n",
       "      <td>moment</td>\n",
       "      <td>meaning</td>\n",
       "      <td>experience</td>\n",
       "      <td>purpose</td>\n",
       "      <td>example</td>\n",
       "      <td>point</td>\n",
       "      <td>love</td>\n",
       "      <td>rest</td>\n",
       "      <td>...</td>\n",
       "      <td>death</td>\n",
       "      <td>event</td>\n",
       "      <td>story</td>\n",
       "      <td>goal</td>\n",
       "      <td>friend</td>\n",
       "      <td>dream</td>\n",
       "      <td>earth</td>\n",
       "      <td>movie</td>\n",
       "      <td>parent</td>\n",
       "      <td>Life/Experience/Love/Purpose</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Topic 3</td>\n",
       "      <td>way</td>\n",
       "      <td>money</td>\n",
       "      <td>online</td>\n",
       "      <td>language</td>\n",
       "      <td>business</td>\n",
       "      <td>home</td>\n",
       "      <td>girl</td>\n",
       "      <td>friend</td>\n",
       "      <td>weight</td>\n",
       "      <td>number</td>\n",
       "      <td>...</td>\n",
       "      <td>app</td>\n",
       "      <td>child</td>\n",
       "      <td>month</td>\n",
       "      <td>market</td>\n",
       "      <td>account</td>\n",
       "      <td>phone</td>\n",
       "      <td>guy</td>\n",
       "      <td>company</td>\n",
       "      <td>hair</td>\n",
       "      <td>Money/Internet/Business</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Topic 4</td>\n",
       "      <td>thing</td>\n",
       "      <td>weekend</td>\n",
       "      <td>parent</td>\n",
       "      <td>today</td>\n",
       "      <td>mind</td>\n",
       "      <td>child</td>\n",
       "      <td>friend</td>\n",
       "      <td>need</td>\n",
       "      <td>love</td>\n",
       "      <td>guy</td>\n",
       "      <td>...</td>\n",
       "      <td>internet</td>\n",
       "      <td>girl</td>\n",
       "      <td>happen</td>\n",
       "      <td>learn</td>\n",
       "      <td>teacher</td>\n",
       "      <td>lot</td>\n",
       "      <td>kind</td>\n",
       "      <td>family</td>\n",
       "      <td>money</td>\n",
       "      <td>Weekend/Parent/Child</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Topic 5</td>\n",
       "      <td>time</td>\n",
       "      <td>period</td>\n",
       "      <td>travel</td>\n",
       "      <td>sex</td>\n",
       "      <td>movie</td>\n",
       "      <td>friend</td>\n",
       "      <td>waste</td>\n",
       "      <td>money</td>\n",
       "      <td>month</td>\n",
       "      <td>space</td>\n",
       "      <td>...</td>\n",
       "      <td>week</td>\n",
       "      <td>change</td>\n",
       "      <td>study</td>\n",
       "      <td>lot</td>\n",
       "      <td>relationship</td>\n",
       "      <td>place</td>\n",
       "      <td>hour</td>\n",
       "      <td>guy</td>\n",
       "      <td>number</td>\n",
       "      <td>Leisure time</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Topic 6</td>\n",
       "      <td>use</td>\n",
       "      <td>language</td>\n",
       "      <td>word</td>\n",
       "      <td>app</td>\n",
       "      <td>type</td>\n",
       "      <td>technique</td>\n",
       "      <td>phone</td>\n",
       "      <td>device</td>\n",
       "      <td>software</td>\n",
       "      <td>number</td>\n",
       "      <td>...</td>\n",
       "      <td>water</td>\n",
       "      <td>method</td>\n",
       "      <td>tool</td>\n",
       "      <td>website</td>\n",
       "      <td>oil</td>\n",
       "      <td>advantage</td>\n",
       "      <td>datum</td>\n",
       "      <td>technology</td>\n",
       "      <td>company</td>\n",
       "      <td>Language/technique/software</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Topic 7</td>\n",
       "      <td>year</td>\n",
       "      <td>girl</td>\n",
       "      <td>experience</td>\n",
       "      <td>age</td>\n",
       "      <td>date</td>\n",
       "      <td>guy</td>\n",
       "      <td>relationship</td>\n",
       "      <td>boy</td>\n",
       "      <td>month</td>\n",
       "      <td>change</td>\n",
       "      <td>...</td>\n",
       "      <td>money</td>\n",
       "      <td>exam</td>\n",
       "      <td>drop</td>\n",
       "      <td>start</td>\n",
       "      <td>company</td>\n",
       "      <td>paper</td>\n",
       "      <td>gap</td>\n",
       "      <td>course</td>\n",
       "      <td>parent</td>\n",
       "      <td>Relationship/Girl/Boy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Topic 8</td>\n",
       "      <td>india</td>\n",
       "      <td>company</td>\n",
       "      <td>state</td>\n",
       "      <td>money</td>\n",
       "      <td>business</td>\n",
       "      <td>place</td>\n",
       "      <td>scope</td>\n",
       "      <td>cost</td>\n",
       "      <td>government</td>\n",
       "      <td>course</td>\n",
       "      <td>...</td>\n",
       "      <td>pakistan</td>\n",
       "      <td>service</td>\n",
       "      <td>car</td>\n",
       "      <td>online</td>\n",
       "      <td>minister</td>\n",
       "      <td>product</td>\n",
       "      <td>city</td>\n",
       "      <td>college</td>\n",
       "      <td>bank</td>\n",
       "      <td>Business relate to India, China or Pakistan</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Topic 9</td>\n",
       "      <td>person</td>\n",
       "      <td>friend</td>\n",
       "      <td>love</td>\n",
       "      <td>relationship</td>\n",
       "      <td>kind</td>\n",
       "      <td>date</td>\n",
       "      <td>dream</td>\n",
       "      <td>message</td>\n",
       "      <td>history</td>\n",
       "      <td>talk</td>\n",
       "      <td>...</td>\n",
       "      <td>personality</td>\n",
       "      <td>reason</td>\n",
       "      <td>word</td>\n",
       "      <td>phone</td>\n",
       "      <td>money</td>\n",
       "      <td>change</td>\n",
       "      <td>age</td>\n",
       "      <td>sex</td>\n",
       "      <td>type</td>\n",
       "      <td>Friend/Love/Relationship</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Topic 10</td>\n",
       "      <td>difference</td>\n",
       "      <td>similarity</td>\n",
       "      <td>state</td>\n",
       "      <td>word</td>\n",
       "      <td>term</td>\n",
       "      <td>language</td>\n",
       "      <td>vs</td>\n",
       "      <td>engineering</td>\n",
       "      <td>number</td>\n",
       "      <td>computer</td>\n",
       "      <td>...</td>\n",
       "      <td>science</td>\n",
       "      <td>culture</td>\n",
       "      <td>example</td>\n",
       "      <td>force</td>\n",
       "      <td>love</td>\n",
       "      <td>power</td>\n",
       "      <td>point</td>\n",
       "      <td>type</td>\n",
       "      <td>function</td>\n",
       "      <td>Difference and similarity/Language/Engineering</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Topic 11</td>\n",
       "      <td>country</td>\n",
       "      <td>state</td>\n",
       "      <td>china</td>\n",
       "      <td>war</td>\n",
       "      <td>government</td>\n",
       "      <td>europe</td>\n",
       "      <td>citizen</td>\n",
       "      <td>language</td>\n",
       "      <td>america</td>\n",
       "      <td>compare</td>\n",
       "      <td>...</td>\n",
       "      <td>population</td>\n",
       "      <td>culture</td>\n",
       "      <td>economy</td>\n",
       "      <td>canada</td>\n",
       "      <td>russia</td>\n",
       "      <td>rate</td>\n",
       "      <td>africa</td>\n",
       "      <td>visa</td>\n",
       "      <td>travel</td>\n",
       "      <td>Culture, travel and visa requirements in sever...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Topic 12</td>\n",
       "      <td>work</td>\n",
       "      <td>tip</td>\n",
       "      <td>company</td>\n",
       "      <td>experience</td>\n",
       "      <td>home</td>\n",
       "      <td>engineer</td>\n",
       "      <td>software</td>\n",
       "      <td>hour</td>\n",
       "      <td>business</td>\n",
       "      <td>visa</td>\n",
       "      <td>...</td>\n",
       "      <td>employee</td>\n",
       "      <td>project</td>\n",
       "      <td>industry</td>\n",
       "      <td>field</td>\n",
       "      <td>money</td>\n",
       "      <td>bank</td>\n",
       "      <td>canada</td>\n",
       "      <td>developer</td>\n",
       "      <td>permit</td>\n",
       "      <td>Tips on working as software engineering</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Topic 13</td>\n",
       "      <td>book</td>\n",
       "      <td>movie</td>\n",
       "      <td>preparation</td>\n",
       "      <td>class</td>\n",
       "      <td>device</td>\n",
       "      <td>exam</td>\n",
       "      <td>history</td>\n",
       "      <td>character</td>\n",
       "      <td>read</td>\n",
       "      <td>inspire</td>\n",
       "      <td>...</td>\n",
       "      <td>theme</td>\n",
       "      <td>beginner</td>\n",
       "      <td>physics</td>\n",
       "      <td>chemistry</td>\n",
       "      <td>story</td>\n",
       "      <td>science</td>\n",
       "      <td>series</td>\n",
       "      <td>setting</td>\n",
       "      <td>math</td>\n",
       "      <td>Book/Movie/Class/History/Physics/Chemistry/Sci...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Topic 14</td>\n",
       "      <td>job</td>\n",
       "      <td>company</td>\n",
       "      <td>engineering</td>\n",
       "      <td>engineer</td>\n",
       "      <td>experience</td>\n",
       "      <td>opportunity</td>\n",
       "      <td>government</td>\n",
       "      <td>pay</td>\n",
       "      <td>degree</td>\n",
       "      <td>software</td>\n",
       "      <td>...</td>\n",
       "      <td>interview</td>\n",
       "      <td>course</td>\n",
       "      <td>money</td>\n",
       "      <td>kind</td>\n",
       "      <td>graduate</td>\n",
       "      <td>computer</td>\n",
       "      <td>field</td>\n",
       "      <td>career</td>\n",
       "      <td>science</td>\n",
       "      <td>Software engineer job opportunitis in Canada</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Topic 15</td>\n",
       "      <td>woman</td>\n",
       "      <td>man</td>\n",
       "      <td>girl</td>\n",
       "      <td>sex</td>\n",
       "      <td>guy</td>\n",
       "      <td>friend</td>\n",
       "      <td>date</td>\n",
       "      <td>relationship</td>\n",
       "      <td>love</td>\n",
       "      <td>age</td>\n",
       "      <td>...</td>\n",
       "      <td>look</td>\n",
       "      <td>child</td>\n",
       "      <td>marriage</td>\n",
       "      <td>hair</td>\n",
       "      <td>husband</td>\n",
       "      <td>boy</td>\n",
       "      <td>marry</td>\n",
       "      <td>movie</td>\n",
       "      <td>body</td>\n",
       "      <td>Love/Life/Relationship</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Topic 16</td>\n",
       "      <td>world</td>\n",
       "      <td>war</td>\n",
       "      <td>place</td>\n",
       "      <td>cup</td>\n",
       "      <td>today</td>\n",
       "      <td>language</td>\n",
       "      <td>history</td>\n",
       "      <td>look</td>\n",
       "      <td>city</td>\n",
       "      <td>change</td>\n",
       "      <td>...</td>\n",
       "      <td>rest</td>\n",
       "      <td>power</td>\n",
       "      <td>company</td>\n",
       "      <td>population</td>\n",
       "      <td>state</td>\n",
       "      <td>ii</td>\n",
       "      <td>money</td>\n",
       "      <td>leader</td>\n",
       "      <td>game</td>\n",
       "      <td>World/War/Language/History</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Topic 17</td>\n",
       "      <td>day</td>\n",
       "      <td>period</td>\n",
       "      <td>hour</td>\n",
       "      <td>week</td>\n",
       "      <td>month</td>\n",
       "      <td>sex</td>\n",
       "      <td>place</td>\n",
       "      <td>night</td>\n",
       "      <td>water</td>\n",
       "      <td>exam</td>\n",
       "      <td>...</td>\n",
       "      <td>today</td>\n",
       "      <td>weight</td>\n",
       "      <td>test</td>\n",
       "      <td>eat</td>\n",
       "      <td>study</td>\n",
       "      <td>cause</td>\n",
       "      <td>food</td>\n",
       "      <td>body</td>\n",
       "      <td>plan</td>\n",
       "      <td>Day/Hour/Week/Month/Sex/Place</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Topic 18</td>\n",
       "      <td>school</td>\n",
       "      <td>student</td>\n",
       "      <td>college</td>\n",
       "      <td>university</td>\n",
       "      <td>engineering</td>\n",
       "      <td>class</td>\n",
       "      <td>study</td>\n",
       "      <td>science</td>\n",
       "      <td>course</td>\n",
       "      <td>business</td>\n",
       "      <td>...</td>\n",
       "      <td>friend</td>\n",
       "      <td>state</td>\n",
       "      <td>exam</td>\n",
       "      <td>money</td>\n",
       "      <td>admission</td>\n",
       "      <td>language</td>\n",
       "      <td>mark</td>\n",
       "      <td>rank</td>\n",
       "      <td>program</td>\n",
       "      <td>School/Student/College/University</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Topic 19</td>\n",
       "      <td>question</td>\n",
       "      <td>quora</td>\n",
       "      <td>answer</td>\n",
       "      <td>interview</td>\n",
       "      <td>ask</td>\n",
       "      <td>topic</td>\n",
       "      <td>paper</td>\n",
       "      <td>type</td>\n",
       "      <td>account</td>\n",
       "      <td>number</td>\n",
       "      <td>...</td>\n",
       "      <td>comment</td>\n",
       "      <td>view</td>\n",
       "      <td>writer</td>\n",
       "      <td>site</td>\n",
       "      <td>user</td>\n",
       "      <td>exam</td>\n",
       "      <td>kind</td>\n",
       "      <td>feed</td>\n",
       "      <td>test</td>\n",
       "      <td>Question/Answer/Quora/Interview</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>20 rows × 21 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "              Word 0      Word 1       Word 2        Word 3       Word 4  \\\n",
       "Topic 0          nan          ab          phd    phenomenon   philippine   \n",
       "Topic 1       people      reason        money         group       friend   \n",
       "Topic 2         life      change       moment       meaning   experience   \n",
       "Topic 3          way       money       online      language     business   \n",
       "Topic 4        thing     weekend       parent         today         mind   \n",
       "Topic 5         time      period       travel           sex        movie   \n",
       "Topic 6          use    language         word           app         type   \n",
       "Topic 7         year        girl   experience           age         date   \n",
       "Topic 8        india     company        state         money     business   \n",
       "Topic 9       person      friend         love  relationship         kind   \n",
       "Topic 10  difference  similarity        state          word         term   \n",
       "Topic 11     country       state        china           war   government   \n",
       "Topic 12        work         tip      company    experience         home   \n",
       "Topic 13        book       movie  preparation         class       device   \n",
       "Topic 14         job     company  engineering      engineer   experience   \n",
       "Topic 15       woman         man         girl           sex          guy   \n",
       "Topic 16       world         war        place           cup        today   \n",
       "Topic 17         day      period         hour          week        month   \n",
       "Topic 18      school     student      college    university  engineering   \n",
       "Topic 19    question       quora       answer     interview          ask   \n",
       "\n",
       "               Word 5        Word 6        Word 7      Word 8      Word 9  \\\n",
       "Topic 0   philosopher    philosophy         phone       photo  photograph   \n",
       "Topic 1           lot  relationship          word     problem         age   \n",
       "Topic 2       purpose       example         point        love        rest   \n",
       "Topic 3          home          girl        friend      weight      number   \n",
       "Topic 4         child        friend          need        love         guy   \n",
       "Topic 5        friend         waste         money       month       space   \n",
       "Topic 6     technique         phone        device    software      number   \n",
       "Topic 7           guy  relationship           boy       month      change   \n",
       "Topic 8         place         scope          cost  government      course   \n",
       "Topic 9          date         dream       message     history        talk   \n",
       "Topic 10     language            vs   engineering      number    computer   \n",
       "Topic 11       europe       citizen      language     america     compare   \n",
       "Topic 12     engineer      software          hour    business        visa   \n",
       "Topic 13         exam       history     character        read     inspire   \n",
       "Topic 14  opportunity    government           pay      degree    software   \n",
       "Topic 15       friend          date  relationship        love         age   \n",
       "Topic 16     language       history          look        city      change   \n",
       "Topic 17          sex         place         night       water        exam   \n",
       "Topic 18        class         study       science      course    business   \n",
       "Topic 19        topic         paper          type     account      number   \n",
       "\n",
       "          ...      Word 11   Word 12    Word 13         Word 14       Word 15  \\\n",
       "Topic 0   ...  photography    photon  photoshop  photosynthesis           php   \n",
       "Topic 1   ...         kind  religion       talk           think         state   \n",
       "Topic 2   ...        death     event      story            goal        friend   \n",
       "Topic 3   ...          app     child      month          market       account   \n",
       "Topic 4   ...     internet      girl     happen           learn       teacher   \n",
       "Topic 5   ...         week    change      study             lot  relationship   \n",
       "Topic 6   ...        water    method       tool         website           oil   \n",
       "Topic 7   ...        money      exam       drop           start       company   \n",
       "Topic 8   ...     pakistan   service        car          online      minister   \n",
       "Topic 9   ...  personality    reason       word           phone         money   \n",
       "Topic 10  ...      science   culture    example           force          love   \n",
       "Topic 11  ...   population   culture    economy          canada        russia   \n",
       "Topic 12  ...     employee   project   industry           field         money   \n",
       "Topic 13  ...        theme  beginner    physics       chemistry         story   \n",
       "Topic 14  ...    interview    course      money            kind      graduate   \n",
       "Topic 15  ...         look     child   marriage            hair       husband   \n",
       "Topic 16  ...         rest     power    company      population         state   \n",
       "Topic 17  ...        today    weight       test             eat         study   \n",
       "Topic 18  ...       friend     state       exam           money     admission   \n",
       "Topic 19  ...      comment      view     writer            site          user   \n",
       "\n",
       "            Word 16    Word 17     Word 18   Word 19  \\\n",
       "Topic 0      physic  physician   physicist   physics   \n",
       "Topic 1        love     family        look    medium   \n",
       "Topic 2       dream      earth       movie    parent   \n",
       "Topic 3       phone        guy     company      hair   \n",
       "Topic 4         lot       kind      family     money   \n",
       "Topic 5       place       hour         guy    number   \n",
       "Topic 6   advantage      datum  technology   company   \n",
       "Topic 7       paper        gap      course    parent   \n",
       "Topic 8     product       city     college      bank   \n",
       "Topic 9      change        age         sex      type   \n",
       "Topic 10      power      point        type  function   \n",
       "Topic 11       rate     africa        visa    travel   \n",
       "Topic 12       bank     canada   developer    permit   \n",
       "Topic 13    science     series     setting      math   \n",
       "Topic 14   computer      field      career   science   \n",
       "Topic 15        boy      marry       movie      body   \n",
       "Topic 16         ii      money      leader      game   \n",
       "Topic 17      cause       food        body      plan   \n",
       "Topic 18   language       mark        rank   program   \n",
       "Topic 19       exam       kind        feed      test   \n",
       "\n",
       "                                                topic_theme  \n",
       "Topic 0                                  Word start from ph  \n",
       "Topic 1                          People/Friend/Relationship  \n",
       "Topic 2                        Life/Experience/Love/Purpose  \n",
       "Topic 3                             Money/Internet/Business  \n",
       "Topic 4                                Weekend/Parent/Child  \n",
       "Topic 5                                        Leisure time  \n",
       "Topic 6                         Language/technique/software  \n",
       "Topic 7                               Relationship/Girl/Boy  \n",
       "Topic 8         Business relate to India, China or Pakistan  \n",
       "Topic 9                            Friend/Love/Relationship  \n",
       "Topic 10     Difference and similarity/Language/Engineering  \n",
       "Topic 11  Culture, travel and visa requirements in sever...  \n",
       "Topic 12            Tips on working as software engineering  \n",
       "Topic 13  Book/Movie/Class/History/Physics/Chemistry/Sci...  \n",
       "Topic 14       Software engineer job opportunitis in Canada  \n",
       "Topic 15                             Love/Life/Relationship  \n",
       "Topic 16                         World/War/Language/History  \n",
       "Topic 17                      Day/Hour/Week/Month/Sex/Place  \n",
       "Topic 18                  School/Student/College/University  \n",
       "Topic 19                    Question/Answer/Quora/Interview  \n",
       "\n",
       "[20 rows x 21 columns]"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_topic_keywords"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_topic_keywords.set_index('topic_theme', inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>topic_theme</th>\n",
       "      <th>Word start from ph</th>\n",
       "      <th>People/Friend/Relationship</th>\n",
       "      <th>Life/Experience/Love/Purpose</th>\n",
       "      <th>Money/Internet/Business</th>\n",
       "      <th>Weekend/Parent/Child</th>\n",
       "      <th>Leisure time</th>\n",
       "      <th>Language/technique/software</th>\n",
       "      <th>Relationship/Girl/Boy</th>\n",
       "      <th>Business relate to India, China or Pakistan</th>\n",
       "      <th>Friend/Love/Relationship</th>\n",
       "      <th>Difference and similarity/Language/Engineering</th>\n",
       "      <th>Culture, travel and visa requirements in several countries</th>\n",
       "      <th>Tips on working as software engineering</th>\n",
       "      <th>Book/Movie/Class/History/Physics/Chemistry/Science</th>\n",
       "      <th>Software engineer job opportunitis in Canada</th>\n",
       "      <th>Love/Life/Relationship</th>\n",
       "      <th>World/War/Language/History</th>\n",
       "      <th>Day/Hour/Week/Month/Sex/Place</th>\n",
       "      <th>School/Student/College/University</th>\n",
       "      <th>Question/Answer/Quora/Interview</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>Word 0</td>\n",
       "      <td>nan</td>\n",
       "      <td>people</td>\n",
       "      <td>life</td>\n",
       "      <td>way</td>\n",
       "      <td>thing</td>\n",
       "      <td>time</td>\n",
       "      <td>use</td>\n",
       "      <td>year</td>\n",
       "      <td>india</td>\n",
       "      <td>person</td>\n",
       "      <td>difference</td>\n",
       "      <td>country</td>\n",
       "      <td>work</td>\n",
       "      <td>book</td>\n",
       "      <td>job</td>\n",
       "      <td>woman</td>\n",
       "      <td>world</td>\n",
       "      <td>day</td>\n",
       "      <td>school</td>\n",
       "      <td>question</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Word 1</td>\n",
       "      <td>ab</td>\n",
       "      <td>reason</td>\n",
       "      <td>change</td>\n",
       "      <td>money</td>\n",
       "      <td>weekend</td>\n",
       "      <td>period</td>\n",
       "      <td>language</td>\n",
       "      <td>girl</td>\n",
       "      <td>company</td>\n",
       "      <td>friend</td>\n",
       "      <td>similarity</td>\n",
       "      <td>state</td>\n",
       "      <td>tip</td>\n",
       "      <td>movie</td>\n",
       "      <td>company</td>\n",
       "      <td>man</td>\n",
       "      <td>war</td>\n",
       "      <td>period</td>\n",
       "      <td>student</td>\n",
       "      <td>quora</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Word 2</td>\n",
       "      <td>phd</td>\n",
       "      <td>money</td>\n",
       "      <td>moment</td>\n",
       "      <td>online</td>\n",
       "      <td>parent</td>\n",
       "      <td>travel</td>\n",
       "      <td>word</td>\n",
       "      <td>experience</td>\n",
       "      <td>state</td>\n",
       "      <td>love</td>\n",
       "      <td>state</td>\n",
       "      <td>china</td>\n",
       "      <td>company</td>\n",
       "      <td>preparation</td>\n",
       "      <td>engineering</td>\n",
       "      <td>girl</td>\n",
       "      <td>place</td>\n",
       "      <td>hour</td>\n",
       "      <td>college</td>\n",
       "      <td>answer</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Word 3</td>\n",
       "      <td>phenomenon</td>\n",
       "      <td>group</td>\n",
       "      <td>meaning</td>\n",
       "      <td>language</td>\n",
       "      <td>today</td>\n",
       "      <td>sex</td>\n",
       "      <td>app</td>\n",
       "      <td>age</td>\n",
       "      <td>money</td>\n",
       "      <td>relationship</td>\n",
       "      <td>word</td>\n",
       "      <td>war</td>\n",
       "      <td>experience</td>\n",
       "      <td>class</td>\n",
       "      <td>engineer</td>\n",
       "      <td>sex</td>\n",
       "      <td>cup</td>\n",
       "      <td>week</td>\n",
       "      <td>university</td>\n",
       "      <td>interview</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Word 4</td>\n",
       "      <td>philippine</td>\n",
       "      <td>friend</td>\n",
       "      <td>experience</td>\n",
       "      <td>business</td>\n",
       "      <td>mind</td>\n",
       "      <td>movie</td>\n",
       "      <td>type</td>\n",
       "      <td>date</td>\n",
       "      <td>business</td>\n",
       "      <td>kind</td>\n",
       "      <td>term</td>\n",
       "      <td>government</td>\n",
       "      <td>home</td>\n",
       "      <td>device</td>\n",
       "      <td>experience</td>\n",
       "      <td>guy</td>\n",
       "      <td>today</td>\n",
       "      <td>month</td>\n",
       "      <td>engineering</td>\n",
       "      <td>ask</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Word 5</td>\n",
       "      <td>philosopher</td>\n",
       "      <td>lot</td>\n",
       "      <td>purpose</td>\n",
       "      <td>home</td>\n",
       "      <td>child</td>\n",
       "      <td>friend</td>\n",
       "      <td>technique</td>\n",
       "      <td>guy</td>\n",
       "      <td>place</td>\n",
       "      <td>date</td>\n",
       "      <td>language</td>\n",
       "      <td>europe</td>\n",
       "      <td>engineer</td>\n",
       "      <td>exam</td>\n",
       "      <td>opportunity</td>\n",
       "      <td>friend</td>\n",
       "      <td>language</td>\n",
       "      <td>sex</td>\n",
       "      <td>class</td>\n",
       "      <td>topic</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Word 6</td>\n",
       "      <td>philosophy</td>\n",
       "      <td>relationship</td>\n",
       "      <td>example</td>\n",
       "      <td>girl</td>\n",
       "      <td>friend</td>\n",
       "      <td>waste</td>\n",
       "      <td>phone</td>\n",
       "      <td>relationship</td>\n",
       "      <td>scope</td>\n",
       "      <td>dream</td>\n",
       "      <td>vs</td>\n",
       "      <td>citizen</td>\n",
       "      <td>software</td>\n",
       "      <td>history</td>\n",
       "      <td>government</td>\n",
       "      <td>date</td>\n",
       "      <td>history</td>\n",
       "      <td>place</td>\n",
       "      <td>study</td>\n",
       "      <td>paper</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Word 7</td>\n",
       "      <td>phone</td>\n",
       "      <td>word</td>\n",
       "      <td>point</td>\n",
       "      <td>friend</td>\n",
       "      <td>need</td>\n",
       "      <td>money</td>\n",
       "      <td>device</td>\n",
       "      <td>boy</td>\n",
       "      <td>cost</td>\n",
       "      <td>message</td>\n",
       "      <td>engineering</td>\n",
       "      <td>language</td>\n",
       "      <td>hour</td>\n",
       "      <td>character</td>\n",
       "      <td>pay</td>\n",
       "      <td>relationship</td>\n",
       "      <td>look</td>\n",
       "      <td>night</td>\n",
       "      <td>science</td>\n",
       "      <td>type</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Word 8</td>\n",
       "      <td>photo</td>\n",
       "      <td>problem</td>\n",
       "      <td>love</td>\n",
       "      <td>weight</td>\n",
       "      <td>love</td>\n",
       "      <td>month</td>\n",
       "      <td>software</td>\n",
       "      <td>month</td>\n",
       "      <td>government</td>\n",
       "      <td>history</td>\n",
       "      <td>number</td>\n",
       "      <td>america</td>\n",
       "      <td>business</td>\n",
       "      <td>read</td>\n",
       "      <td>degree</td>\n",
       "      <td>love</td>\n",
       "      <td>city</td>\n",
       "      <td>water</td>\n",
       "      <td>course</td>\n",
       "      <td>account</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Word 9</td>\n",
       "      <td>photograph</td>\n",
       "      <td>age</td>\n",
       "      <td>rest</td>\n",
       "      <td>number</td>\n",
       "      <td>guy</td>\n",
       "      <td>space</td>\n",
       "      <td>number</td>\n",
       "      <td>change</td>\n",
       "      <td>course</td>\n",
       "      <td>talk</td>\n",
       "      <td>computer</td>\n",
       "      <td>compare</td>\n",
       "      <td>visa</td>\n",
       "      <td>inspire</td>\n",
       "      <td>software</td>\n",
       "      <td>age</td>\n",
       "      <td>change</td>\n",
       "      <td>exam</td>\n",
       "      <td>business</td>\n",
       "      <td>number</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Word 10</td>\n",
       "      <td>photographer</td>\n",
       "      <td>government</td>\n",
       "      <td>lesson</td>\n",
       "      <td>website</td>\n",
       "      <td>relationship</td>\n",
       "      <td>girl</td>\n",
       "      <td>car</td>\n",
       "      <td>engineering</td>\n",
       "      <td>china</td>\n",
       "      <td>number</td>\n",
       "      <td>cell</td>\n",
       "      <td>law</td>\n",
       "      <td>service</td>\n",
       "      <td>jee</td>\n",
       "      <td>canada</td>\n",
       "      <td>wife</td>\n",
       "      <td>end</td>\n",
       "      <td>trip</td>\n",
       "      <td>computer</td>\n",
       "      <td>post</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Word 11</td>\n",
       "      <td>photography</td>\n",
       "      <td>kind</td>\n",
       "      <td>death</td>\n",
       "      <td>app</td>\n",
       "      <td>internet</td>\n",
       "      <td>week</td>\n",
       "      <td>water</td>\n",
       "      <td>money</td>\n",
       "      <td>pakistan</td>\n",
       "      <td>personality</td>\n",
       "      <td>science</td>\n",
       "      <td>population</td>\n",
       "      <td>employee</td>\n",
       "      <td>theme</td>\n",
       "      <td>interview</td>\n",
       "      <td>look</td>\n",
       "      <td>rest</td>\n",
       "      <td>today</td>\n",
       "      <td>friend</td>\n",
       "      <td>comment</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Word 12</td>\n",
       "      <td>photon</td>\n",
       "      <td>religion</td>\n",
       "      <td>event</td>\n",
       "      <td>child</td>\n",
       "      <td>girl</td>\n",
       "      <td>change</td>\n",
       "      <td>method</td>\n",
       "      <td>exam</td>\n",
       "      <td>service</td>\n",
       "      <td>reason</td>\n",
       "      <td>culture</td>\n",
       "      <td>culture</td>\n",
       "      <td>project</td>\n",
       "      <td>beginner</td>\n",
       "      <td>course</td>\n",
       "      <td>child</td>\n",
       "      <td>power</td>\n",
       "      <td>weight</td>\n",
       "      <td>state</td>\n",
       "      <td>view</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Word 13</td>\n",
       "      <td>photoshop</td>\n",
       "      <td>talk</td>\n",
       "      <td>story</td>\n",
       "      <td>month</td>\n",
       "      <td>happen</td>\n",
       "      <td>study</td>\n",
       "      <td>tool</td>\n",
       "      <td>drop</td>\n",
       "      <td>car</td>\n",
       "      <td>word</td>\n",
       "      <td>example</td>\n",
       "      <td>economy</td>\n",
       "      <td>industry</td>\n",
       "      <td>physics</td>\n",
       "      <td>money</td>\n",
       "      <td>marriage</td>\n",
       "      <td>company</td>\n",
       "      <td>test</td>\n",
       "      <td>exam</td>\n",
       "      <td>writer</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Word 14</td>\n",
       "      <td>photosynthesis</td>\n",
       "      <td>think</td>\n",
       "      <td>goal</td>\n",
       "      <td>market</td>\n",
       "      <td>learn</td>\n",
       "      <td>lot</td>\n",
       "      <td>website</td>\n",
       "      <td>start</td>\n",
       "      <td>online</td>\n",
       "      <td>phone</td>\n",
       "      <td>force</td>\n",
       "      <td>canada</td>\n",
       "      <td>field</td>\n",
       "      <td>chemistry</td>\n",
       "      <td>kind</td>\n",
       "      <td>hair</td>\n",
       "      <td>population</td>\n",
       "      <td>eat</td>\n",
       "      <td>money</td>\n",
       "      <td>site</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Word 15</td>\n",
       "      <td>php</td>\n",
       "      <td>state</td>\n",
       "      <td>friend</td>\n",
       "      <td>account</td>\n",
       "      <td>teacher</td>\n",
       "      <td>relationship</td>\n",
       "      <td>oil</td>\n",
       "      <td>company</td>\n",
       "      <td>minister</td>\n",
       "      <td>money</td>\n",
       "      <td>love</td>\n",
       "      <td>russia</td>\n",
       "      <td>money</td>\n",
       "      <td>story</td>\n",
       "      <td>graduate</td>\n",
       "      <td>husband</td>\n",
       "      <td>state</td>\n",
       "      <td>study</td>\n",
       "      <td>admission</td>\n",
       "      <td>user</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Word 16</td>\n",
       "      <td>physic</td>\n",
       "      <td>love</td>\n",
       "      <td>dream</td>\n",
       "      <td>phone</td>\n",
       "      <td>lot</td>\n",
       "      <td>place</td>\n",
       "      <td>advantage</td>\n",
       "      <td>paper</td>\n",
       "      <td>product</td>\n",
       "      <td>change</td>\n",
       "      <td>power</td>\n",
       "      <td>rate</td>\n",
       "      <td>bank</td>\n",
       "      <td>science</td>\n",
       "      <td>computer</td>\n",
       "      <td>boy</td>\n",
       "      <td>ii</td>\n",
       "      <td>cause</td>\n",
       "      <td>language</td>\n",
       "      <td>exam</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Word 17</td>\n",
       "      <td>physician</td>\n",
       "      <td>family</td>\n",
       "      <td>earth</td>\n",
       "      <td>guy</td>\n",
       "      <td>kind</td>\n",
       "      <td>hour</td>\n",
       "      <td>datum</td>\n",
       "      <td>gap</td>\n",
       "      <td>city</td>\n",
       "      <td>age</td>\n",
       "      <td>point</td>\n",
       "      <td>africa</td>\n",
       "      <td>canada</td>\n",
       "      <td>series</td>\n",
       "      <td>field</td>\n",
       "      <td>marry</td>\n",
       "      <td>money</td>\n",
       "      <td>food</td>\n",
       "      <td>mark</td>\n",
       "      <td>kind</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Word 18</td>\n",
       "      <td>physicist</td>\n",
       "      <td>look</td>\n",
       "      <td>movie</td>\n",
       "      <td>company</td>\n",
       "      <td>family</td>\n",
       "      <td>guy</td>\n",
       "      <td>technology</td>\n",
       "      <td>course</td>\n",
       "      <td>college</td>\n",
       "      <td>sex</td>\n",
       "      <td>type</td>\n",
       "      <td>visa</td>\n",
       "      <td>developer</td>\n",
       "      <td>setting</td>\n",
       "      <td>career</td>\n",
       "      <td>movie</td>\n",
       "      <td>leader</td>\n",
       "      <td>body</td>\n",
       "      <td>rank</td>\n",
       "      <td>feed</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Word 19</td>\n",
       "      <td>physics</td>\n",
       "      <td>medium</td>\n",
       "      <td>parent</td>\n",
       "      <td>hair</td>\n",
       "      <td>money</td>\n",
       "      <td>number</td>\n",
       "      <td>company</td>\n",
       "      <td>parent</td>\n",
       "      <td>bank</td>\n",
       "      <td>type</td>\n",
       "      <td>function</td>\n",
       "      <td>travel</td>\n",
       "      <td>permit</td>\n",
       "      <td>math</td>\n",
       "      <td>science</td>\n",
       "      <td>body</td>\n",
       "      <td>game</td>\n",
       "      <td>plan</td>\n",
       "      <td>program</td>\n",
       "      <td>test</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "topic_theme Word start from ph People/Friend/Relationship  \\\n",
       "Word 0                     nan                     people   \n",
       "Word 1                      ab                     reason   \n",
       "Word 2                     phd                      money   \n",
       "Word 3              phenomenon                      group   \n",
       "Word 4              philippine                     friend   \n",
       "Word 5             philosopher                        lot   \n",
       "Word 6              philosophy               relationship   \n",
       "Word 7                   phone                       word   \n",
       "Word 8                   photo                    problem   \n",
       "Word 9              photograph                        age   \n",
       "Word 10           photographer                 government   \n",
       "Word 11            photography                       kind   \n",
       "Word 12                 photon                   religion   \n",
       "Word 13              photoshop                       talk   \n",
       "Word 14         photosynthesis                      think   \n",
       "Word 15                    php                      state   \n",
       "Word 16                 physic                       love   \n",
       "Word 17              physician                     family   \n",
       "Word 18              physicist                       look   \n",
       "Word 19                physics                     medium   \n",
       "\n",
       "topic_theme Life/Experience/Love/Purpose Money/Internet/Business  \\\n",
       "Word 0                              life                     way   \n",
       "Word 1                            change                   money   \n",
       "Word 2                            moment                  online   \n",
       "Word 3                           meaning                language   \n",
       "Word 4                        experience                business   \n",
       "Word 5                           purpose                    home   \n",
       "Word 6                           example                    girl   \n",
       "Word 7                             point                  friend   \n",
       "Word 8                              love                  weight   \n",
       "Word 9                              rest                  number   \n",
       "Word 10                           lesson                 website   \n",
       "Word 11                            death                     app   \n",
       "Word 12                            event                   child   \n",
       "Word 13                            story                   month   \n",
       "Word 14                             goal                  market   \n",
       "Word 15                           friend                 account   \n",
       "Word 16                            dream                   phone   \n",
       "Word 17                            earth                     guy   \n",
       "Word 18                            movie                 company   \n",
       "Word 19                           parent                    hair   \n",
       "\n",
       "topic_theme Weekend/Parent/Child  Leisure time Language/technique/software  \\\n",
       "Word 0                     thing          time                         use   \n",
       "Word 1                   weekend        period                    language   \n",
       "Word 2                    parent        travel                        word   \n",
       "Word 3                     today           sex                         app   \n",
       "Word 4                      mind         movie                        type   \n",
       "Word 5                     child        friend                   technique   \n",
       "Word 6                    friend         waste                       phone   \n",
       "Word 7                      need         money                      device   \n",
       "Word 8                      love         month                    software   \n",
       "Word 9                       guy         space                      number   \n",
       "Word 10             relationship          girl                         car   \n",
       "Word 11                 internet          week                       water   \n",
       "Word 12                     girl        change                      method   \n",
       "Word 13                   happen         study                        tool   \n",
       "Word 14                    learn           lot                     website   \n",
       "Word 15                  teacher  relationship                         oil   \n",
       "Word 16                      lot         place                   advantage   \n",
       "Word 17                     kind          hour                       datum   \n",
       "Word 18                   family           guy                  technology   \n",
       "Word 19                    money        number                     company   \n",
       "\n",
       "topic_theme Relationship/Girl/Boy Business relate to India, China or Pakistan  \\\n",
       "Word 0                       year                                       india   \n",
       "Word 1                       girl                                     company   \n",
       "Word 2                 experience                                       state   \n",
       "Word 3                        age                                       money   \n",
       "Word 4                       date                                    business   \n",
       "Word 5                        guy                                       place   \n",
       "Word 6               relationship                                       scope   \n",
       "Word 7                        boy                                        cost   \n",
       "Word 8                      month                                  government   \n",
       "Word 9                     change                                      course   \n",
       "Word 10               engineering                                       china   \n",
       "Word 11                     money                                    pakistan   \n",
       "Word 12                      exam                                     service   \n",
       "Word 13                      drop                                         car   \n",
       "Word 14                     start                                      online   \n",
       "Word 15                   company                                    minister   \n",
       "Word 16                     paper                                     product   \n",
       "Word 17                       gap                                        city   \n",
       "Word 18                    course                                     college   \n",
       "Word 19                    parent                                        bank   \n",
       "\n",
       "topic_theme Friend/Love/Relationship  \\\n",
       "Word 0                        person   \n",
       "Word 1                        friend   \n",
       "Word 2                          love   \n",
       "Word 3                  relationship   \n",
       "Word 4                          kind   \n",
       "Word 5                          date   \n",
       "Word 6                         dream   \n",
       "Word 7                       message   \n",
       "Word 8                       history   \n",
       "Word 9                          talk   \n",
       "Word 10                       number   \n",
       "Word 11                  personality   \n",
       "Word 12                       reason   \n",
       "Word 13                         word   \n",
       "Word 14                        phone   \n",
       "Word 15                        money   \n",
       "Word 16                       change   \n",
       "Word 17                          age   \n",
       "Word 18                          sex   \n",
       "Word 19                         type   \n",
       "\n",
       "topic_theme Difference and similarity/Language/Engineering  \\\n",
       "Word 0                                          difference   \n",
       "Word 1                                          similarity   \n",
       "Word 2                                               state   \n",
       "Word 3                                                word   \n",
       "Word 4                                                term   \n",
       "Word 5                                            language   \n",
       "Word 6                                                  vs   \n",
       "Word 7                                         engineering   \n",
       "Word 8                                              number   \n",
       "Word 9                                            computer   \n",
       "Word 10                                               cell   \n",
       "Word 11                                            science   \n",
       "Word 12                                            culture   \n",
       "Word 13                                            example   \n",
       "Word 14                                              force   \n",
       "Word 15                                               love   \n",
       "Word 16                                              power   \n",
       "Word 17                                              point   \n",
       "Word 18                                               type   \n",
       "Word 19                                           function   \n",
       "\n",
       "topic_theme Culture, travel and visa requirements in several countries  \\\n",
       "Word 0                                                 country           \n",
       "Word 1                                                   state           \n",
       "Word 2                                                   china           \n",
       "Word 3                                                     war           \n",
       "Word 4                                              government           \n",
       "Word 5                                                  europe           \n",
       "Word 6                                                 citizen           \n",
       "Word 7                                                language           \n",
       "Word 8                                                 america           \n",
       "Word 9                                                 compare           \n",
       "Word 10                                                    law           \n",
       "Word 11                                             population           \n",
       "Word 12                                                culture           \n",
       "Word 13                                                economy           \n",
       "Word 14                                                 canada           \n",
       "Word 15                                                 russia           \n",
       "Word 16                                                   rate           \n",
       "Word 17                                                 africa           \n",
       "Word 18                                                   visa           \n",
       "Word 19                                                 travel           \n",
       "\n",
       "topic_theme Tips on working as software engineering  \\\n",
       "Word 0                                         work   \n",
       "Word 1                                          tip   \n",
       "Word 2                                      company   \n",
       "Word 3                                   experience   \n",
       "Word 4                                         home   \n",
       "Word 5                                     engineer   \n",
       "Word 6                                     software   \n",
       "Word 7                                         hour   \n",
       "Word 8                                     business   \n",
       "Word 9                                         visa   \n",
       "Word 10                                     service   \n",
       "Word 11                                    employee   \n",
       "Word 12                                     project   \n",
       "Word 13                                    industry   \n",
       "Word 14                                       field   \n",
       "Word 15                                       money   \n",
       "Word 16                                        bank   \n",
       "Word 17                                      canada   \n",
       "Word 18                                   developer   \n",
       "Word 19                                      permit   \n",
       "\n",
       "topic_theme Book/Movie/Class/History/Physics/Chemistry/Science  \\\n",
       "Word 0                                                    book   \n",
       "Word 1                                                   movie   \n",
       "Word 2                                             preparation   \n",
       "Word 3                                                   class   \n",
       "Word 4                                                  device   \n",
       "Word 5                                                    exam   \n",
       "Word 6                                                 history   \n",
       "Word 7                                               character   \n",
       "Word 8                                                    read   \n",
       "Word 9                                                 inspire   \n",
       "Word 10                                                    jee   \n",
       "Word 11                                                  theme   \n",
       "Word 12                                               beginner   \n",
       "Word 13                                                physics   \n",
       "Word 14                                              chemistry   \n",
       "Word 15                                                  story   \n",
       "Word 16                                                science   \n",
       "Word 17                                                 series   \n",
       "Word 18                                                setting   \n",
       "Word 19                                                   math   \n",
       "\n",
       "topic_theme Software engineer job opportunitis in Canada  \\\n",
       "Word 0                                               job   \n",
       "Word 1                                           company   \n",
       "Word 2                                       engineering   \n",
       "Word 3                                          engineer   \n",
       "Word 4                                        experience   \n",
       "Word 5                                       opportunity   \n",
       "Word 6                                        government   \n",
       "Word 7                                               pay   \n",
       "Word 8                                            degree   \n",
       "Word 9                                          software   \n",
       "Word 10                                           canada   \n",
       "Word 11                                        interview   \n",
       "Word 12                                           course   \n",
       "Word 13                                            money   \n",
       "Word 14                                             kind   \n",
       "Word 15                                         graduate   \n",
       "Word 16                                         computer   \n",
       "Word 17                                            field   \n",
       "Word 18                                           career   \n",
       "Word 19                                          science   \n",
       "\n",
       "topic_theme Love/Life/Relationship World/War/Language/History  \\\n",
       "Word 0                       woman                      world   \n",
       "Word 1                         man                        war   \n",
       "Word 2                        girl                      place   \n",
       "Word 3                         sex                        cup   \n",
       "Word 4                         guy                      today   \n",
       "Word 5                      friend                   language   \n",
       "Word 6                        date                    history   \n",
       "Word 7                relationship                       look   \n",
       "Word 8                        love                       city   \n",
       "Word 9                         age                     change   \n",
       "Word 10                       wife                        end   \n",
       "Word 11                       look                       rest   \n",
       "Word 12                      child                      power   \n",
       "Word 13                   marriage                    company   \n",
       "Word 14                       hair                 population   \n",
       "Word 15                    husband                      state   \n",
       "Word 16                        boy                         ii   \n",
       "Word 17                      marry                      money   \n",
       "Word 18                      movie                     leader   \n",
       "Word 19                       body                       game   \n",
       "\n",
       "topic_theme Day/Hour/Week/Month/Sex/Place School/Student/College/University  \\\n",
       "Word 0                                day                            school   \n",
       "Word 1                             period                           student   \n",
       "Word 2                               hour                           college   \n",
       "Word 3                               week                        university   \n",
       "Word 4                              month                       engineering   \n",
       "Word 5                                sex                             class   \n",
       "Word 6                              place                             study   \n",
       "Word 7                              night                           science   \n",
       "Word 8                              water                            course   \n",
       "Word 9                               exam                          business   \n",
       "Word 10                              trip                          computer   \n",
       "Word 11                             today                            friend   \n",
       "Word 12                            weight                             state   \n",
       "Word 13                              test                              exam   \n",
       "Word 14                               eat                             money   \n",
       "Word 15                             study                         admission   \n",
       "Word 16                             cause                          language   \n",
       "Word 17                              food                              mark   \n",
       "Word 18                              body                              rank   \n",
       "Word 19                              plan                           program   \n",
       "\n",
       "topic_theme Question/Answer/Quora/Interview  \n",
       "Word 0                             question  \n",
       "Word 1                                quora  \n",
       "Word 2                               answer  \n",
       "Word 3                            interview  \n",
       "Word 4                                  ask  \n",
       "Word 5                                topic  \n",
       "Word 6                                paper  \n",
       "Word 7                                 type  \n",
       "Word 8                              account  \n",
       "Word 9                               number  \n",
       "Word 10                                post  \n",
       "Word 11                             comment  \n",
       "Word 12                                view  \n",
       "Word 13                              writer  \n",
       "Word 14                                site  \n",
       "Word 15                                user  \n",
       "Word 16                                exam  \n",
       "Word 17                                kind  \n",
       "Word 18                                feed  \n",
       "Word 19                                test  "
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show the keyword table transposed: one column per topic theme, one row per word rank.\n",
    "df_topic_keywords.transpose()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the document-topic weight matrix: one row per document, one column per topic.\n",
    "# NOTE: the name `lda_output` is kept for later cells; the weights come from the NMF model.\n",
    "lda_output = nmf.transform(tfidf)\n",
    "\n",
    "# Column labels: the topic themes that index df_topic_keywords\n",
    "# (the same Index as df_topic_keywords.T.columns, without building the transpose).\n",
    "topicnames = df_topic_keywords.index\n",
    "\n",
    "# Row labels: Doc0, Doc1, ... one per row of df_nouns.\n",
    "# NOTE(review): assumes tfidf was built row-for-row from df_nouns - confirm;\n",
    "# pd.DataFrame raises a ValueError below if the lengths differ.\n",
    "docnames = [\"Doc\" + str(i) for i in range(len(df_nouns))]\n",
    "\n",
    "# Weights are rounded to 2 decimals for display only.\n",
    "df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)\n",
    "\n",
    "# Dominant topic = column position of the largest weight, taken from the UNROUNDED\n",
    "# matrix. Using the rounded frame turns small weights into 0.00, and np.argmax then\n",
    "# resolves the tie to position 0, mislabelling those documents as topic 0.\n",
    "# Rows whose weights are all exactly zero still resolve to position 0.\n",
    "dominant_topic = np.argmax(lda_output, axis=1)\n",
    "df_document_topic['dominant_topic'] = dominant_topic"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>topic_theme</th>\n",
       "      <th>Word start from ph</th>\n",
       "      <th>People/Friend/Relationship</th>\n",
       "      <th>Life/Experience/Love/Purpose</th>\n",
       "      <th>Money/Internet/Business</th>\n",
       "      <th>Weekend/Parent/Child</th>\n",
       "      <th>Leisure time</th>\n",
       "      <th>Language/technique/software</th>\n",
       "      <th>Relationship/Girl/Boy</th>\n",
       "      <th>Business relate to India, China or Pakistan</th>\n",
       "      <th>Friend/Love/Relationship</th>\n",
       "      <th>...</th>\n",
       "      <th>Culture, travel and visa requirements in several countries</th>\n",
       "      <th>Tips on working as software engineering</th>\n",
       "      <th>Book/Movie/Class/History/Physics/Chemistry/Science</th>\n",
       "      <th>Software engineer job opportunitis in Canada</th>\n",
       "      <th>Love/Life/Relationship</th>\n",
       "      <th>World/War/Language/History</th>\n",
       "      <th>Day/Hour/Week/Month/Sex/Place</th>\n",
       "      <th>School/Student/College/University</th>\n",
       "      <th>Question/Answer/Quora/Interview</th>\n",
       "      <th>dominant_topic</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>Doc0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Doc1</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.03</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Doc2</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.01</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Doc3</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Doc4</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Doc983796</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Doc983797</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Doc983798</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Doc983799</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Doc983800</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>983801 rows × 21 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "topic_theme  Word start from ph  People/Friend/Relationship  \\\n",
       "Doc0                        0.0                        0.00   \n",
       "Doc1                        0.0                        0.03   \n",
       "Doc2                        0.0                        0.00   \n",
       "Doc3                        0.0                        0.00   \n",
       "Doc4                        0.0                        0.00   \n",
       "...                         ...                         ...   \n",
       "Doc983796                   0.0                        0.00   \n",
       "Doc983797                   0.0                        0.00   \n",
       "Doc983798                   0.0                        0.00   \n",
       "Doc983799                   0.0                        0.00   \n",
       "Doc983800                   0.0                        0.00   \n",
       "\n",
       "topic_theme  Life/Experience/Love/Purpose  Money/Internet/Business  \\\n",
       "Doc0                                  0.0                      0.0   \n",
       "Doc1                                  0.0                      0.0   \n",
       "Doc2                                  0.0                      0.0   \n",
       "Doc3                                  0.0                      0.0   \n",
       "Doc4                                  0.0                      0.0   \n",
       "...                                   ...                      ...   \n",
       "Doc983796                             0.0                      0.0   \n",
       "Doc983797                             0.0                      0.0   \n",
       "Doc983798                             0.0                      0.0   \n",
       "Doc983799                             0.0                      0.0   \n",
       "Doc983800                             0.0                      0.0   \n",
       "\n",
       "topic_theme  Weekend/Parent/Child  Leisure time  Language/technique/software  \\\n",
       "Doc0                          0.0          0.00                          0.0   \n",
       "Doc1                          0.0          0.00                          0.0   \n",
       "Doc2                          0.0          0.01                          0.0   \n",
       "Doc3                          0.0          0.00                          0.0   \n",
       "Doc4                          0.0          0.00                          0.0   \n",
       "...                           ...           ...                          ...   \n",
       "Doc983796                     0.0          0.00                          0.0   \n",
       "Doc983797                     0.0          0.00                          0.0   \n",
       "Doc983798                     0.0          0.00                          0.0   \n",
       "Doc983799                     0.0          0.00                          0.0   \n",
       "Doc983800                     0.0          0.00                          0.0   \n",
       "\n",
       "topic_theme  Relationship/Girl/Boy  \\\n",
       "Doc0                           0.0   \n",
       "Doc1                           0.0   \n",
       "Doc2                           0.0   \n",
       "Doc3                           0.0   \n",
       "Doc4                           0.0   \n",
       "...                            ...   \n",
       "Doc983796                      0.0   \n",
       "Doc983797                      0.0   \n",
       "Doc983798                      0.0   \n",
       "Doc983799                      0.0   \n",
       "Doc983800                      0.0   \n",
       "\n",
       "topic_theme  Business relate to India, China or Pakistan  \\\n",
       "Doc0                                                 0.0   \n",
       "Doc1                                                 0.0   \n",
       "Doc2                                                 0.0   \n",
       "Doc3                                                 0.0   \n",
       "Doc4                                                 0.0   \n",
       "...                                                  ...   \n",
       "Doc983796                                            0.0   \n",
       "Doc983797                                            0.0   \n",
       "Doc983798                                            0.0   \n",
       "Doc983799                                            0.0   \n",
       "Doc983800                                            0.0   \n",
       "\n",
       "topic_theme  Friend/Love/Relationship  ...  \\\n",
       "Doc0                              0.0  ...   \n",
       "Doc1                              0.0  ...   \n",
       "Doc2                              0.0  ...   \n",
       "Doc3                              0.0  ...   \n",
       "Doc4                              0.0  ...   \n",
       "...                               ...  ...   \n",
       "Doc983796                         0.0  ...   \n",
       "Doc983797                         0.0  ...   \n",
       "Doc983798                         0.0  ...   \n",
       "Doc983799                         0.0  ...   \n",
       "Doc983800                         0.0  ...   \n",
       "\n",
       "topic_theme  Culture, travel and visa requirements in several countries  \\\n",
       "Doc0                                                       0.0            \n",
       "Doc1                                                       0.0            \n",
       "Doc2                                                       0.0            \n",
       "Doc3                                                       0.0            \n",
       "Doc4                                                       0.0            \n",
       "...                                                        ...            \n",
       "Doc983796                                                  0.0            \n",
       "Doc983797                                                  0.0            \n",
       "Doc983798                                                  0.0            \n",
       "Doc983799                                                  0.0            \n",
       "Doc983800                                                  0.0            \n",
       "\n",
       "topic_theme  Tips on working as software engineering  \\\n",
       "Doc0                                             0.0   \n",
       "Doc1                                             0.0   \n",
       "Doc2                                             0.0   \n",
       "Doc3                                             0.0   \n",
       "Doc4                                             0.0   \n",
       "...                                              ...   \n",
       "Doc983796                                        0.0   \n",
       "Doc983797                                        0.0   \n",
       "Doc983798                                        0.0   \n",
       "Doc983799                                        0.0   \n",
       "Doc983800                                        0.0   \n",
       "\n",
       "topic_theme  Book/Movie/Class/History/Physics/Chemistry/Science  \\\n",
       "Doc0                                                       0.0    \n",
       "Doc1                                                       0.0    \n",
       "Doc2                                                       0.0    \n",
       "Doc3                                                       0.0    \n",
       "Doc4                                                       0.0    \n",
       "...                                                        ...    \n",
       "Doc983796                                                  0.0    \n",
       "Doc983797                                                  0.0    \n",
       "Doc983798                                                  0.0    \n",
       "Doc983799                                                  0.0    \n",
       "Doc983800                                                  0.0    \n",
       "\n",
       "topic_theme  Software engineer job opportunitis in Canada  \\\n",
       "Doc0                                                  0.0   \n",
       "Doc1                                                  0.0   \n",
       "Doc2                                                  0.0   \n",
       "Doc3                                                  0.0   \n",
       "Doc4                                                  0.0   \n",
       "...                                                   ...   \n",
       "Doc983796                                             0.0   \n",
       "Doc983797                                             0.0   \n",
       "Doc983798                                             0.0   \n",
       "Doc983799                                             0.0   \n",
       "Doc983800                                             0.0   \n",
       "\n",
       "topic_theme  Love/Life/Relationship  World/War/Language/History  \\\n",
       "Doc0                            0.0                         0.0   \n",
       "Doc1                            0.0                         0.0   \n",
       "Doc2                            0.0                         0.0   \n",
       "Doc3                            0.0                         0.0   \n",
       "Doc4                            0.0                         0.0   \n",
       "...                             ...                         ...   \n",
       "Doc983796                       0.0                         0.0   \n",
       "Doc983797                       0.0                         0.0   \n",
       "Doc983798                       0.0                         0.0   \n",
       "Doc983799                       0.0                         0.0   \n",
       "Doc983800                       0.0                         0.0   \n",
       "\n",
       "topic_theme  Day/Hour/Week/Month/Sex/Place  School/Student/College/University  \\\n",
       "Doc0                                   0.0                                0.0   \n",
       "Doc1                                   0.0                                0.0   \n",
       "Doc2                                   0.0                                0.0   \n",
       "Doc3                                   0.0                                0.0   \n",
       "Doc4                                   0.0                                0.0   \n",
       "...                                    ...                                ...   \n",
       "Doc983796                              0.0                                0.0   \n",
       "Doc983797                              0.0                                0.0   \n",
       "Doc983798                              0.0                                0.0   \n",
       "Doc983799                              0.0                                0.0   \n",
       "Doc983800                              0.0                                0.0   \n",
       "\n",
       "topic_theme  Question/Answer/Quora/Interview  dominant_topic  \n",
       "Doc0                                     0.0               0  \n",
       "Doc1                                     0.0               1  \n",
       "Doc2                                     0.0               5  \n",
       "Doc3                                     0.0               0  \n",
       "Doc4                                     0.0               0  \n",
       "...                                      ...             ...  \n",
       "Doc983796                                0.0               0  \n",
       "Doc983797                                0.0               0  \n",
       "Doc983798                                0.0               0  \n",
       "Doc983799                                0.0              10  \n",
       "Doc983800                                0.0               0  \n",
       "\n",
       "[983801 rows x 21 columns]"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the per-document topic weights (rows labelled Doc0, Doc1, ...); the last\n",
    "# column, dominant_topic, holds a single topic id for each document.\n",
    "df_document_topic"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Move the document labels (Doc0, Doc1, ...) out of the index so the topic table can be\n",
    "# joined to df_nouns on a shared index. The guard keeps the cell idempotent: resetting\n",
    "# the index a second time would otherwise insert a stray 'level_0' column that ends up\n",
    "# in df_sent_topic.\n",
    "if 'index' not in df_document_topic.columns:\n",
    "    df_document_topic = df_document_topic.reset_index()\n",
    "\n",
    "# Join nouns and topic weights row by row, then discard the document-label column.\n",
    "df_sent_topic = (\n",
    "    pd.merge(df_nouns, df_document_topic, left_index=True, right_index=True)\n",
    "    .drop(columns='index')\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>question_lemmatize_clean</th>\n",
       "      <th>Word start from ph</th>\n",
       "      <th>People/Friend/Relationship</th>\n",
       "      <th>Life/Experience/Love/Purpose</th>\n",
       "      <th>Money/Internet/Business</th>\n",
       "      <th>Weekend/Parent/Child</th>\n",
       "      <th>Leisure time</th>\n",
       "      <th>Language/technique/software</th>\n",
       "      <th>Relationship/Girl/Boy</th>\n",
       "      <th>Business relate to India, China or Pakistan</th>\n",
       "      <th>...</th>\n",
       "      <th>Culture, travel and visa requirements in several countries</th>\n",
       "      <th>Tips on working as software engineering</th>\n",
       "      <th>Book/Movie/Class/History/Physics/Chemistry/Science</th>\n",
       "      <th>Software engineer job opportunitis in Canada</th>\n",
       "      <th>Love/Life/Relationship</th>\n",
       "      <th>World/War/Language/History</th>\n",
       "      <th>Day/Hour/Week/Month/Sex/Place</th>\n",
       "      <th>School/Student/College/University</th>\n",
       "      <th>Question/Answer/Quora/Interview</th>\n",
       "      <th>dominant_topic</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>nationalist province nation</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>do adopt dog people</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.03</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>velocity time velocity space geometry</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.01</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>guericke magdeburg hemisphere</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>helicon d mountain bike tyre</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>983796</td>\n",
       "      <td>facebook page page</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>983797</td>\n",
       "      <td>something</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>983798</td>\n",
       "      <td>cycle women cycle</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>983799</td>\n",
       "      <td>difference currency note rs rs currency note r...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>983800</td>\n",
       "      <td>form dml</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>983801 rows × 22 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                 question_lemmatize_clean  Word start from ph  \\\n",
       "0                             nationalist province nation                 0.0   \n",
       "1                                     do adopt dog people                 0.0   \n",
       "2                   velocity time velocity space geometry                 0.0   \n",
       "3                           guericke magdeburg hemisphere                 0.0   \n",
       "4                            helicon d mountain bike tyre                 0.0   \n",
       "...                                                   ...                 ...   \n",
       "983796                                 facebook page page                 0.0   \n",
       "983797                                          something                 0.0   \n",
       "983798                                  cycle women cycle                 0.0   \n",
       "983799  difference currency note rs rs currency note r...                 0.0   \n",
       "983800                                           form dml                 0.0   \n",
       "\n",
       "        People/Friend/Relationship  Life/Experience/Love/Purpose  \\\n",
       "0                             0.00                           0.0   \n",
       "1                             0.03                           0.0   \n",
       "2                             0.00                           0.0   \n",
       "3                             0.00                           0.0   \n",
       "4                             0.00                           0.0   \n",
       "...                            ...                           ...   \n",
       "983796                        0.00                           0.0   \n",
       "983797                        0.00                           0.0   \n",
       "983798                        0.00                           0.0   \n",
       "983799                        0.00                           0.0   \n",
       "983800                        0.00                           0.0   \n",
       "\n",
       "        Money/Internet/Business  Weekend/Parent/Child  Leisure time  \\\n",
       "0                           0.0                   0.0          0.00   \n",
       "1                           0.0                   0.0          0.00   \n",
       "2                           0.0                   0.0          0.01   \n",
       "3                           0.0                   0.0          0.00   \n",
       "4                           0.0                   0.0          0.00   \n",
       "...                         ...                   ...           ...   \n",
       "983796                      0.0                   0.0          0.00   \n",
       "983797                      0.0                   0.0          0.00   \n",
       "983798                      0.0                   0.0          0.00   \n",
       "983799                      0.0                   0.0          0.00   \n",
       "983800                      0.0                   0.0          0.00   \n",
       "\n",
       "        Language/technique/software  Relationship/Girl/Boy  \\\n",
       "0                               0.0                    0.0   \n",
       "1                               0.0                    0.0   \n",
       "2                               0.0                    0.0   \n",
       "3                               0.0                    0.0   \n",
       "4                               0.0                    0.0   \n",
       "...                             ...                    ...   \n",
       "983796                          0.0                    0.0   \n",
       "983797                          0.0                    0.0   \n",
       "983798                          0.0                    0.0   \n",
       "983799                          0.0                    0.0   \n",
       "983800                          0.0                    0.0   \n",
       "\n",
       "        Business relate to India, China or Pakistan  ...  \\\n",
       "0                                               0.0  ...   \n",
       "1                                               0.0  ...   \n",
       "2                                               0.0  ...   \n",
       "3                                               0.0  ...   \n",
       "4                                               0.0  ...   \n",
       "...                                             ...  ...   \n",
       "983796                                          0.0  ...   \n",
       "983797                                          0.0  ...   \n",
       "983798                                          0.0  ...   \n",
       "983799                                          0.0  ...   \n",
       "983800                                          0.0  ...   \n",
       "\n",
       "        Culture, travel and visa requirements in several countries  \\\n",
       "0                                                     0.0            \n",
       "1                                                     0.0            \n",
       "2                                                     0.0            \n",
       "3                                                     0.0            \n",
       "4                                                     0.0            \n",
       "...                                                   ...            \n",
       "983796                                                0.0            \n",
       "983797                                                0.0            \n",
       "983798                                                0.0            \n",
       "983799                                                0.0            \n",
       "983800                                                0.0            \n",
       "\n",
       "        Tips on working as software engineering  \\\n",
       "0                                           0.0   \n",
       "1                                           0.0   \n",
       "2                                           0.0   \n",
       "3                                           0.0   \n",
       "4                                           0.0   \n",
       "...                                         ...   \n",
       "983796                                      0.0   \n",
       "983797                                      0.0   \n",
       "983798                                      0.0   \n",
       "983799                                      0.0   \n",
       "983800                                      0.0   \n",
       "\n",
       "        Book/Movie/Class/History/Physics/Chemistry/Science  \\\n",
       "0                                                     0.0    \n",
       "1                                                     0.0    \n",
       "2                                                     0.0    \n",
       "3                                                     0.0    \n",
       "4                                                     0.0    \n",
       "...                                                   ...    \n",
       "983796                                                0.0    \n",
       "983797                                                0.0    \n",
       "983798                                                0.0    \n",
       "983799                                                0.0    \n",
       "983800                                                0.0    \n",
       "\n",
       "        Software engineer job opportunitis in Canada  Love/Life/Relationship  \\\n",
       "0                                                0.0                     0.0   \n",
       "1                                                0.0                     0.0   \n",
       "2                                                0.0                     0.0   \n",
       "3                                                0.0                     0.0   \n",
       "4                                                0.0                     0.0   \n",
       "...                                              ...                     ...   \n",
       "983796                                           0.0                     0.0   \n",
       "983797                                           0.0                     0.0   \n",
       "983798                                           0.0                     0.0   \n",
       "983799                                           0.0                     0.0   \n",
       "983800                                           0.0                     0.0   \n",
       "\n",
       "        World/War/Language/History  Day/Hour/Week/Month/Sex/Place  \\\n",
       "0                              0.0                            0.0   \n",
       "1                              0.0                            0.0   \n",
       "2                              0.0                            0.0   \n",
       "3                              0.0                            0.0   \n",
       "4                              0.0                            0.0   \n",
       "...                            ...                            ...   \n",
       "983796                         0.0                            0.0   \n",
       "983797                         0.0                            0.0   \n",
       "983798                         0.0                            0.0   \n",
       "983799                         0.0                            0.0   \n",
       "983800                         0.0                            0.0   \n",
       "\n",
       "        School/Student/College/University  Question/Answer/Quora/Interview  \\\n",
       "0                                     0.0                              0.0   \n",
       "1                                     0.0                              0.0   \n",
       "2                                     0.0                              0.0   \n",
       "3                                     0.0                              0.0   \n",
       "4                                     0.0                              0.0   \n",
       "...                                   ...                              ...   \n",
       "983796                                0.0                              0.0   \n",
       "983797                                0.0                              0.0   \n",
       "983798                                0.0                              0.0   \n",
       "983799                                0.0                              0.0   \n",
       "983800                                0.0                              0.0   \n",
       "\n",
       "        dominant_topic  \n",
       "0                    0  \n",
       "1                    1  \n",
       "2                    5  \n",
       "3                    0  \n",
       "4                    0  \n",
       "...                ...  \n",
       "983796               0  \n",
       "983797               0  \n",
       "983798               0  \n",
       "983799              10  \n",
       "983800               0  \n",
       "\n",
       "[983801 rows x 22 columns]"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the merged frame: the cleaned noun text (question_lemmatize_clean) next to\n",
    "# its per-topic weights and dominant_topic id.\n",
    "df_sent_topic"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only the text and its dominant topic id. Take an explicit copy: a column is added\n",
    "# to this frame in a later cell, and assigning into a bare column selection of\n",
    "# df_sent_topic triggers pandas' SettingWithCopyWarning.\n",
    "df_topic_theme = df_sent_topic[['question_lemmatize_clean', 'dominant_topic']].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>question_lemmatize_clean</th>\n",
       "      <th>dominant_topic</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>nationalist province nation</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>do adopt dog people</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>velocity time velocity space geometry</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>guericke magdeburg hemisphere</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>helicon d mountain bike tyre</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>5</td>\n",
       "      <td>dachau treblinka</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>6</td>\n",
       "      <td>opinion report view</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>7</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>8</td>\n",
       "      <td>thing dress dress</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>9</td>\n",
       "      <td>phase people feelingslive something way thing ...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                            question_lemmatize_clean  dominant_topic\n",
       "0                        nationalist province nation               0\n",
       "1                                do adopt dog people               1\n",
       "2              velocity time velocity space geometry               5\n",
       "3                      guericke magdeburg hemisphere               0\n",
       "4                       helicon d mountain bike tyre               0\n",
       "5                                   dachau treblinka               0\n",
       "6                                opinion report view               0\n",
       "7                                                NaN               0\n",
       "8                                  thing dress dress               4\n",
       "9  phase people feelingslive something way thing ...               1"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Spot-check the first ten questions against their dominant topic id.\n",
    "# NOTE(review): rows whose text is NaN (e.g. row 7) still show dominant_topic 0 --\n",
    "# confirm that this is intended rather than an artifact of empty documents.\n",
    "df_topic_theme.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/tljh/user/lib/python3.6/site-packages/ipykernel_launcher.py:42: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n"
     ]
    }
   ],
   "source": [
    "def label_theme(row):\n",
    "    if row['dominant_topic'] == 0 :\n",
    "        return 'Word start from ph'\n",
    "    if row['dominant_topic'] == 1 :\n",
    "        return 'People/Friend/Relationship'\n",
    "    if row['dominant_topic'] == 2 :\n",
    "        return 'Life/Experience/Love/Purpose'\n",
    "    if row['dominant_topic'] == 3:\n",
    "        return 'Money/Internet/Business'\n",
    "    if row['dominant_topic']  == 4:\n",
    "        return 'Weekend/Parent/Child'\n",
    "    if row['dominant_topic'] == 5:\n",
    "        return 'Leisure time'\n",
    "    if row['dominant_topic'] == 6:\n",
    "        return 'Language/technique/software'\n",
    "    if row['dominant_topic'] == 7:\n",
    "        return 'Relationship/Girl/Boy'\n",
    "    if row['dominant_topic'] == 8:\n",
    "        return 'Business relate to India, China or Pakistan'\n",
    "    if row['dominant_topic'] == 9:\n",
    "        return 'Friend/Love/Relationship'\n",
    "    if row['dominant_topic'] == 10:\n",
    "        return 'Difference and similarity/Language/Engineering'\n",
    "    if row['dominant_topic'] == 11:\n",
    "        return 'Culture, travel and visa requirements in several countries'\n",
    "    if row['dominant_topic'] == 12:\n",
    "        return 'Tips on working as software engineering'\n",
    "    if row['dominant_topic'] == 13:\n",
    "        return 'Book/Movie/Class/History/Physics/Chemistry/Science'\n",
    "    if row['dominant_topic'] == 14:\n",
    "        return 'Software engineer job opportunitis in Canada'\n",
    "    if row['dominant_topic'] == 15:\n",
    "        return 'Love/Life/Relationship'\n",
    "    if row['dominant_topic'] == 16:\n",
    "        return 'World/War/Language/History'\n",
    "    if row['dominant_topic'] == 17:\n",
    "        return 'Day/Hour/Week/Month/Sex/Place'\n",
    "    if row['dominant_topic'] == 18:\n",
    "        return 'School/Student/College/University'\n",
    "    if row['dominant_topic'] == 19:\n",
    "        return 'Question/Answer/Quora/Interview'\n",
    "# Attach the theme label for each row. The frame is rebuilt with assign() instead of\n",
    "# writing a column into what may be a slice of df_sent_topic; the in-place write is what\n",
    "# raised SettingWithCopyWarning.\n",
    "df_topic_theme = df_topic_theme.assign(\n",
    "    dominant_topic_theme=df_topic_theme.apply(label_theme, axis=1)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>question_lemmatize_clean</th>\n",
       "      <th>dominant_topic</th>\n",
       "      <th>dominant_topic_theme</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>983786</td>\n",
       "      <td>anyone chevrolet car chevrolet exit experience</td>\n",
       "      <td>0</td>\n",
       "      <td>Word start from ph</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>983787</td>\n",
       "      <td>phone i weekend weekday</td>\n",
       "      <td>0</td>\n",
       "      <td>Word start from ph</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>983788</td>\n",
       "      <td>series doctor</td>\n",
       "      <td>0</td>\n",
       "      <td>Word start from ph</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>983789</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>Word start from ph</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>983790</td>\n",
       "      <td>country continent</td>\n",
       "      <td>11</td>\n",
       "      <td>Culture, travel and visa requirements in sever...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>983791</td>\n",
       "      <td>statistic use operation research</td>\n",
       "      <td>6</td>\n",
       "      <td>Language/technique/software</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>983792</td>\n",
       "      <td>leak gasket engine</td>\n",
       "      <td>0</td>\n",
       "      <td>Word start from ph</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>983793</td>\n",
       "      <td>opportunity manager marketing operation backgr...</td>\n",
       "      <td>0</td>\n",
       "      <td>Word start from ph</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>983794</td>\n",
       "      <td>woman chess player</td>\n",
       "      <td>15</td>\n",
       "      <td>Love/Life/Relationship</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>983795</td>\n",
       "      <td>college</td>\n",
       "      <td>18</td>\n",
       "      <td>School/Student/College/University</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>983796</td>\n",
       "      <td>facebook page page</td>\n",
       "      <td>0</td>\n",
       "      <td>Word start from ph</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>983797</td>\n",
       "      <td>something</td>\n",
       "      <td>0</td>\n",
       "      <td>Word start from ph</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>983798</td>\n",
       "      <td>cycle women cycle</td>\n",
       "      <td>0</td>\n",
       "      <td>Word start from ph</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>983799</td>\n",
       "      <td>difference currency note rs rs currency note r...</td>\n",
       "      <td>10</td>\n",
       "      <td>Difference and similarity/Language/Engineering</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>983800</td>\n",
       "      <td>form dml</td>\n",
       "      <td>0</td>\n",
       "      <td>Word start from ph</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                 question_lemmatize_clean  dominant_topic  \\\n",
       "983786     anyone chevrolet car chevrolet exit experience               0   \n",
       "983787                            phone i weekend weekday               0   \n",
       "983788                                      series doctor               0   \n",
       "983789                                                NaN               0   \n",
       "983790                                  country continent              11   \n",
       "983791                   statistic use operation research               6   \n",
       "983792                                 leak gasket engine               0   \n",
       "983793  opportunity manager marketing operation backgr...               0   \n",
       "983794                                 woman chess player              15   \n",
       "983795                                            college              18   \n",
       "983796                                 facebook page page               0   \n",
       "983797                                          something               0   \n",
       "983798                                  cycle women cycle               0   \n",
       "983799  difference currency note rs rs currency note r...              10   \n",
       "983800                                           form dml               0   \n",
       "\n",
       "                                     dominant_topic_theme  \n",
       "983786                                 Word start from ph  \n",
       "983787                                 Word start from ph  \n",
       "983788                                 Word start from ph  \n",
       "983789                                 Word start from ph  \n",
       "983790  Culture, travel and visa requirements in sever...  \n",
       "983791                        Language/technique/software  \n",
       "983792                                 Word start from ph  \n",
       "983793                                 Word start from ph  \n",
       "983794                             Love/Life/Relationship  \n",
       "983795                  School/Student/College/University  \n",
       "983796                                 Word start from ph  \n",
       "983797                                 Word start from ph  \n",
       "983798                                 Word start from ph  \n",
       "983799     Difference and similarity/Language/Engineering  \n",
       "983800                                 Word start from ph  "
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_topic_theme.tail(15)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv('quora_challenge.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'How did Otto von Guericke used the Magdeburg hemispheres?'"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['question_text'][3]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'What is the dumbest, yet possibly true explanation for Trump being elected?'"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['question_text'][12]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tokenize and Clean-up using gensim’s simple_preprocess\n",
    "def sent_to_words(sentences):\n",
    "    for sentence in sentences:\n",
    "        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))\n",
    "\n",
    "# Lemmatization and remove pronouns\n",
    "def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):\n",
    "    \"\"\"https://spacy.io/api/annotation\"\"\"\n",
    "    texts_out = []\n",
    "    for sent in texts:\n",
    "        doc = nlp(\" \".join(sent)) \n",
    "        texts_out.append(\" \".join([token.lemma_ if token.lemma_ not in ['-PRON-'] else '' for token in doc if token.pos_ in allowed_postags]))\n",
    "    return texts_out"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['school', 'student', 'college', 'university', 'engineering', 'class', 'study', 'science', 'course', 'business', 'computer', 'friend', 'state', 'exam', 'money', 'admission', 'language', 'mark', 'rank', 'program']\n"
     ]
    }
   ],
   "source": [
    "# Initialize spacy 'en' model, keeping only tagger component (for efficiency)\n",
    "# We will not assign dependency labels, and we will not lable named entities. And we need pos. \n",
    "\n",
    "nlp = spacy.load('en', disable=['parser', 'ner'])\n",
    "\n",
    "# Define function to predict topic for a given new question.\n",
    "def predict_topic(text, nlp=nlp):\n",
    "    global sent_to_words\n",
    "    global lemmatization\n",
    "\n",
    "    # Step 1: Clean with simple_preprocess\n",
    "    mytext_2 = list(sent_to_words(text))\n",
    "\n",
    "    # Step 2: Lemmatize\n",
    "    mytext_3 = lemmatization(mytext_2, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])\n",
    "\n",
    "    # Step 3: Vectorize transform\n",
    "    mytext_4 = tfidf_vectorizer.transform(mytext_3)\n",
    "\n",
    "    # Step 4: LDA Transform\n",
    "    topic_probability_scores = nmf.transform(mytext_4)\n",
    "    topic = df_topic_keywords.iloc[np.argmax(topic_probability_scores), :].values.tolist()\n",
    "    return topic, topic_probability_scores\n",
    "\n",
    "# Predict the topic\n",
    "mytext = [\"Just like Larry Page and Sergey Brin unseated their incumbents with a better search engine, how likely is it that two Computer Science PhD students create a search engine that unseats Google? How vulnerable is Google to this possibility\"]\n",
    "topic, prob_scores = predict_topic(text = mytext)\n",
    "print(topic)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(['school',\n",
       "  'student',\n",
       "  'college',\n",
       "  'university',\n",
       "  'engineering',\n",
       "  'class',\n",
       "  'study',\n",
       "  'science',\n",
       "  'course',\n",
       "  'business',\n",
       "  'computer',\n",
       "  'friend',\n",
       "  'state',\n",
       "  'exam',\n",
       "  'money',\n",
       "  'admission',\n",
       "  'language',\n",
       "  'mark',\n",
       "  'rank',\n",
       "  'program'],\n",
       " array([[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,\n",
       "         0.00000000e+00, 0.00000000e+00, 1.89518728e-04, 0.00000000e+00,\n",
       "         0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,\n",
       "         0.00000000e+00, 1.04084621e-05, 4.92109835e-04, 0.00000000e+00,\n",
       "         0.00000000e+00, 0.00000000e+00, 5.56300193e-03, 3.80650193e-07]]))"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "topic, prob_scores"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Word 0          school\n",
       "Word 1         student\n",
       "Word 2         college\n",
       "Word 3      university\n",
       "Word 4     engineering\n",
       "Word 5           class\n",
       "Word 6           study\n",
       "Word 7         science\n",
       "Word 8          course\n",
       "Word 9        business\n",
       "Word 10       computer\n",
       "Word 11         friend\n",
       "Word 12          state\n",
       "Word 13           exam\n",
       "Word 14          money\n",
       "Word 15      admission\n",
       "Word 16       language\n",
       "Word 17           mark\n",
       "Word 18           rank\n",
       "Word 19        program\n",
       "Name: School/Student/College/University, dtype: object"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_topic_keywords.T['School/Student/College/University']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
